├── .dir-locals.el ├── .gitignore ├── .pylintrc ├── LICENSE.txt ├── README.org ├── mwscrape ├── __init__.py ├── resolveconflicts.py └── scrape.py └── setup.py /.dir-locals.el: -------------------------------------------------------------------------------- 1 | ((python-mode 2 | (eval . (add-hook 'before-save-hook 'blacken-buffer nil t)))) 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *~ 3 | *.egg-info 4 | __pycache__ 5 | README.html -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loaded into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS,.git 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=0 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python module names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 
34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | file-ignored, 77 | suppressed-message, 78 | useless-suppression, 79 | deprecated-pragma, 80 | use-symbolic-message-instead, 81 | apply-builtin, 82 | basestring-builtin, 83 | buffer-builtin, 84 | cmp-builtin, 85 | coerce-builtin, 86 | execfile-builtin, 87 | file-builtin, 88 | long-builtin, 89 | raw_input-builtin, 90 | reduce-builtin, 91 | standarderror-builtin, 92 | unicode-builtin, 93 | xrange-builtin, 94 | coerce-method, 95 | delslice-method, 96 | getslice-method, 97 | setslice-method, 98 | no-absolute-import, 99 | old-division, 100 | dict-iter-method, 101 | dict-view-method, 102 | next-method-called, 103 | metaclass-assignment, 104 | indexing-exception, 105 | raising-string, 106 | reload-builtin, 107 | oct-method, 108 | hex-method, 109 | nonzero-method, 110 | cmp-method, 111 | input-builtin, 112 | round-builtin, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | eq-without-hash, 121 | div-method, 122 | idiv-method, 123 | rdiv-method, 124 | exception-message-attribute, 125 | invalid-str-codec, 126 | sys-max-int, 127 | bad-python3-import, 128 | deprecated-string-function, 129 | deprecated-str-translate-call, 130 | deprecated-itertools-function, 131 | deprecated-types-field, 132 | next-method-defined, 133 | dict-items-not-iterating, 134 | dict-keys-not-iterating, 135 | dict-values-not-iterating, 136 | deprecated-operator-function, 137 | deprecated-urllib-function, 138 | xreadlines-attribute, 139 | deprecated-sys-function, 140 | exception-escape, 141 | comprehension-escape, 142 | 
missing-module-docstring, 143 | missing-function-docstring, 144 | bad-continuation, 145 | broad-except 146 | 147 | # Enable the message, report, category or checker with the given id(s). You can 148 | # either give multiple identifiers separated by comma (,) or put this option 149 | # multiple times (only on the command line, not in the configuration file where 150 | # it should appear only once). See also the "--disable" option for examples. 151 | enable=c-extension-no-member 152 | 153 | 154 | [REPORTS] 155 | 156 | # Python expression which should return a score less than or equal to 10. You 157 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 158 | # which contain the number of messages in each category, as well as 'statement' 159 | # which is the total number of statements analyzed. This score is used by the 160 | # global evaluation report (RP0004). 161 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 162 | 163 | # Template used to display messages. This is a python new-style format string 164 | # used to format the message information. See doc for all details. 165 | #msg-template= 166 | 167 | # Set the output format. Available formats are text, parseable, colorized, json 168 | # and msvs (visual studio). You can also give a reporter class, e.g. 169 | # mypackage.mymodule.MyReporterClass. 170 | output-format=text 171 | 172 | # Tells whether to display a full report or only the messages. 173 | reports=no 174 | 175 | # Activate the evaluation score. 176 | score=yes 177 | 178 | 179 | [REFACTORING] 180 | 181 | # Maximum number of nested blocks for function / method body 182 | max-nested-blocks=5 183 | 184 | # Complete name of functions that never return. When checking for 185 | # inconsistent-return-statements if a never returning function is called then 186 | # it will be considered as an explicit return statement and no message will be 187 | # printed. 
188 | never-returning-functions=sys.exit 189 | 190 | 191 | [LOGGING] 192 | 193 | # Format style used to check logging format string. `old` means using % 194 | # formatting, `new` is for `{}` formatting, and `fstr` is for f-strings. 195 | logging-format-style=old 196 | 197 | # Logging modules to check that the string format arguments are in logging 198 | # function parameter format. 199 | logging-modules=logging 200 | 201 | 202 | [SPELLING] 203 | 204 | # Limits count of emitted suggestions for spelling mistakes. 205 | max-spelling-suggestions=4 206 | 207 | # Spelling dictionary name. Available dictionaries: none. To make it work, 208 | # install the python-enchant package. 209 | spelling-dict= 210 | 211 | # List of comma separated words that should not be checked. 212 | spelling-ignore-words= 213 | 214 | # A path to a file that contains the private dictionary; one word per line. 215 | spelling-private-dict-file= 216 | 217 | # Tells whether to store unknown words to the private dictionary (see the 218 | # --spelling-private-dict-file option) instead of raising a message. 219 | spelling-store-unknown-words=no 220 | 221 | 222 | [MISCELLANEOUS] 223 | 224 | # List of note tags to take in consideration, separated by a comma. 225 | notes=FIXME, 226 | XXX, 227 | TODO 228 | 229 | 230 | [TYPECHECK] 231 | 232 | # List of decorators that produce context managers, such as 233 | # contextlib.contextmanager. Add to this list to register other decorators that 234 | # produce valid context managers. 235 | contextmanager-decorators=contextlib.contextmanager 236 | 237 | # List of members which are set dynamically and missed by pylint inference 238 | # system, and so shouldn't trigger E1101 when accessed. Python regular 239 | # expressions are accepted. 240 | generated-members= 241 | 242 | # Tells whether missing members accessed in mixin class should be ignored. A 243 | # mixin class is detected if its name ends with "mixin" (case insensitive). 
244 | ignore-mixin-members=yes 245 | 246 | # Tells whether to warn about missing members when the owner of the attribute 247 | # is inferred to be None. 248 | ignore-none=yes 249 | 250 | # This flag controls whether pylint should warn about no-member and similar 251 | # checks whenever an opaque object is returned when inferring. The inference 252 | # can return multiple potential results while evaluating a Python object, but 253 | # some branches might not be evaluated, which results in partial inference. In 254 | # that case, it might be useful to still emit no-member and other checks for 255 | # the rest of the inferred objects. 256 | ignore-on-opaque-inference=yes 257 | 258 | # List of class names for which member attributes should not be checked (useful 259 | # for classes with dynamically set attributes). This supports the use of 260 | # qualified names. 261 | ignored-classes=optparse.Values,thread._local,_thread._local 262 | 263 | # List of module names for which member attributes should not be checked 264 | # (useful for modules/projects where namespaces are manipulated during runtime 265 | # and thus existing member attributes cannot be deduced by static analysis). It 266 | # supports qualified module names, as well as Unix pattern matching. 267 | ignored-modules= 268 | 269 | # Show a hint with possible names when a member name was not found. The aspect 270 | # of finding the hint is based on edit distance. 271 | missing-member-hint=yes 272 | 273 | # The minimum edit distance a name should have in order to be considered a 274 | # similar match for a missing member name. 275 | missing-member-hint-distance=1 276 | 277 | # The total number of similar names that should be taken in consideration when 278 | # showing a hint for a missing member. 279 | missing-member-max-choices=1 280 | 281 | # List of decorators that change the signature of a decorated function. 
282 | signature-mutators= 283 | 284 | 285 | [VARIABLES] 286 | 287 | # List of additional names supposed to be defined in builtins. Remember that 288 | # you should avoid defining new builtins when possible. 289 | additional-builtins= 290 | 291 | # Tells whether unused global variables should be treated as a violation. 292 | allow-global-unused-variables=yes 293 | 294 | # List of strings which can identify a callback function by name. A callback 295 | # name must start or end with one of those strings. 296 | callbacks=cb_, 297 | _cb 298 | 299 | # A regular expression matching the name of dummy variables (i.e. expected to 300 | # not be used). 301 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 302 | 303 | # Argument names that match this expression will be ignored. Default to name 304 | # with leading underscore. 305 | ignored-argument-names=_.*|^ignored_|^unused_ 306 | 307 | # Tells whether we should check for unused import in __init__ files. 308 | init-import=no 309 | 310 | # List of qualified module names which can have objects that can redefine 311 | # builtins. 312 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 313 | 314 | 315 | [FORMAT] 316 | 317 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 318 | expected-line-ending-format= 319 | 320 | # Regexp for a line that is allowed to be longer than the limit. 321 | ignore-long-lines=^\s*(# )??$ 322 | 323 | # Number of spaces of indent required inside a hanging or continued line. 324 | indent-after-paren=4 325 | 326 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 327 | # tab). 328 | indent-string=' ' 329 | 330 | # Maximum number of characters on a single line. 331 | max-line-length=100 332 | 333 | # Maximum number of lines in a module. 334 | max-module-lines=1000 335 | 336 | # List of optional constructs for which whitespace checking is disabled. 
`dict- 337 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 338 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 339 | # `empty-line` allows space-only lines. 340 | no-space-check=trailing-comma, 341 | dict-separator 342 | 343 | # Allow the body of a class to be on the same line as the declaration if body 344 | # contains single statement. 345 | single-line-class-stmt=no 346 | 347 | # Allow the body of an if to be on the same line as the test if there is no 348 | # else. 349 | single-line-if-stmt=no 350 | 351 | 352 | [SIMILARITIES] 353 | 354 | # Ignore comments when computing similarities. 355 | ignore-comments=yes 356 | 357 | # Ignore docstrings when computing similarities. 358 | ignore-docstrings=yes 359 | 360 | # Ignore imports when computing similarities. 361 | ignore-imports=no 362 | 363 | # Minimum lines number of a similarity. 364 | min-similarity-lines=4 365 | 366 | 367 | [BASIC] 368 | 369 | # Naming style matching correct argument names. 370 | argument-naming-style=snake_case 371 | 372 | # Regular expression matching correct argument names. Overrides argument- 373 | # naming-style. 374 | #argument-rgx= 375 | 376 | # Naming style matching correct attribute names. 377 | attr-naming-style=snake_case 378 | 379 | # Regular expression matching correct attribute names. Overrides attr-naming- 380 | # style. 381 | #attr-rgx= 382 | 383 | # Bad variable names which should always be refused, separated by a comma. 384 | bad-names=foo, 385 | bar, 386 | baz, 387 | toto, 388 | tutu, 389 | tata 390 | 391 | # Naming style matching correct class attribute names. 392 | class-attribute-naming-style=any 393 | 394 | # Regular expression matching correct class attribute names. Overrides class- 395 | # attribute-naming-style. 396 | #class-attribute-rgx= 397 | 398 | # Naming style matching correct class names. 399 | class-naming-style=PascalCase 400 | 401 | # Regular expression matching correct class names. 
Overrides class-naming- 402 | # style. 403 | #class-rgx= 404 | 405 | # Naming style matching correct constant names. 406 | const-naming-style=UPPER_CASE 407 | 408 | # Regular expression matching correct constant names. Overrides const-naming- 409 | # style. 410 | #const-rgx= 411 | 412 | # Minimum line length for functions/classes that require docstrings, shorter 413 | # ones are exempt. 414 | docstring-min-length=-1 415 | 416 | # Naming style matching correct function names. 417 | function-naming-style=snake_case 418 | 419 | # Regular expression matching correct function names. Overrides function- 420 | # naming-style. 421 | #function-rgx= 422 | 423 | # Good variable names which should always be accepted, separated by a comma. 424 | good-names=i, 425 | j, 426 | k, 427 | ex, 428 | Run, 429 | _, 430 | db, 431 | f, 432 | dt, 433 | p 434 | 435 | # Include a hint for the correct naming format with invalid-name. 436 | include-naming-hint=no 437 | 438 | # Naming style matching correct inline iteration names. 439 | inlinevar-naming-style=any 440 | 441 | # Regular expression matching correct inline iteration names. Overrides 442 | # inlinevar-naming-style. 443 | #inlinevar-rgx= 444 | 445 | # Naming style matching correct method names. 446 | method-naming-style=snake_case 447 | 448 | # Regular expression matching correct method names. Overrides method-naming- 449 | # style. 450 | #method-rgx= 451 | 452 | # Naming style matching correct module names. 453 | module-naming-style=snake_case 454 | 455 | # Regular expression matching correct module names. Overrides module-naming- 456 | # style. 457 | #module-rgx= 458 | 459 | # Colon-delimited sets of names that determine each other's naming style when 460 | # the name regexes allow several styles. 461 | name-group= 462 | 463 | # Regular expression which should only match function or class names that do 464 | # not require a docstring. 
465 | no-docstring-rgx=^_ 466 | 467 | # List of decorators that produce properties, such as abc.abstractproperty. Add 468 | # to this list to register other decorators that produce valid properties. 469 | # These decorators are taken in consideration only for invalid-name. 470 | property-classes=abc.abstractproperty 471 | 472 | # Naming style matching correct variable names. 473 | variable-naming-style=snake_case 474 | 475 | # Regular expression matching correct variable names. Overrides variable- 476 | # naming-style. 477 | #variable-rgx= 478 | 479 | 480 | [STRING] 481 | 482 | # This flag controls whether the implicit-str-concat-in-sequence should 483 | # generate a warning on implicit string concatenation in sequences defined over 484 | # several lines. 485 | check-str-concat-over-line-jumps=no 486 | 487 | 488 | [IMPORTS] 489 | 490 | # List of modules that can be imported at any level, not just the top level 491 | # one. 492 | allow-any-import-level= 493 | 494 | # Allow wildcard imports from modules that define __all__. 495 | allow-wildcard-with-all=no 496 | 497 | # Analyse import fallback blocks. This can be used to support both Python 2 and 498 | # 3 compatible code, which means that the block might have code that exists 499 | # only in one or another interpreter, leading to false positives when analysed. 500 | analyse-fallback-blocks=no 501 | 502 | # Deprecated modules which should not be used, separated by a comma. 503 | deprecated-modules=optparse,tkinter.tix 504 | 505 | # Create a graph of external dependencies in the given file (report RP0402 must 506 | # not be disabled). 507 | ext-import-graph= 508 | 509 | # Create a graph of every (i.e. internal and external) dependencies in the 510 | # given file (report RP0402 must not be disabled). 511 | import-graph= 512 | 513 | # Create a graph of internal dependencies in the given file (report RP0402 must 514 | # not be disabled). 
515 | int-import-graph= 516 | 517 | # Force import order to recognize a module as part of the standard 518 | # compatibility libraries. 519 | known-standard-library= 520 | 521 | # Force import order to recognize a module as part of a third party library. 522 | known-third-party=enchant 523 | 524 | # Couples of modules and preferred modules, separated by a comma. 525 | preferred-modules= 526 | 527 | 528 | [CLASSES] 529 | 530 | # List of method names used to declare (i.e. assign) instance attributes. 531 | defining-attr-methods=__init__, 532 | __new__, 533 | setUp, 534 | __post_init__ 535 | 536 | # List of member names, which should be excluded from the protected access 537 | # warning. 538 | exclude-protected=_asdict, 539 | _fields, 540 | _replace, 541 | _source, 542 | _make 543 | 544 | # List of valid names for the first argument in a class method. 545 | valid-classmethod-first-arg=cls 546 | 547 | # List of valid names for the first argument in a metaclass class method. 548 | valid-metaclass-classmethod-first-arg=cls 549 | 550 | 551 | [DESIGN] 552 | 553 | # Maximum number of arguments for function / method. 554 | max-args=5 555 | 556 | # Maximum number of attributes for a class (see R0902). 557 | max-attributes=7 558 | 559 | # Maximum number of boolean expressions in an if statement (see R0916). 560 | max-bool-expr=5 561 | 562 | # Maximum number of branches for function / method body. 563 | max-branches=12 564 | 565 | # Maximum number of locals for function / method body. 566 | max-locals=15 567 | 568 | # Maximum number of parents for a class (see R0901). 569 | max-parents=7 570 | 571 | # Maximum number of public methods for a class (see R0904). 572 | max-public-methods=20 573 | 574 | # Maximum number of return / yield for function / method body. 575 | max-returns=6 576 | 577 | # Maximum number of statements in function / method body. 578 | max-statements=50 579 | 580 | # Minimum number of public methods for a class (see R0903). 
581 | min-public-methods=2 582 | 583 | 584 | [EXCEPTIONS] 585 | 586 | # Exceptions that will emit a warning when being caught. Defaults to 587 | # "BaseException, Exception". 588 | overgeneral-exceptions=BaseException, 589 | Exception 590 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. 
"Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. 
Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. 
Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. 
Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. 
Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. 
However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | /mwscrape/ downloads rendered articles from MediaWiki sites via 2 | web API and stores them in CouchDB to enable further offline 3 | processing. 4 | 5 | ** Installation 6 | 7 | /mwscrape/ depends on the following: 8 | 9 | - [[http://couchdb.apache.org][CouchDB]] (1.3.0 or newer) 10 | - [[http://python.org][Python 3]] (3.6 or newer) 11 | 12 | Consult your operating system documentation and these projects' 13 | websites for installation instructions. 14 | 15 | For example, on Ubuntu 22.04, the following command installs 16 | required packages: 17 | 18 | #+BEGIN_SRC sh 19 | sudo apt install python3 python3-venv curl apt-transport-https gnupg 20 | #+END_SRC 21 | 22 | To install CouchDB, first download and save the Apache CouchDB repository public key: 23 | 24 | #+BEGIN_SRC sh 25 | curl https://couchdb.apache.org/repo/keys.asc | gpg --dearmor | sudo tee /usr/share/keyrings/couchdb-archive-keyring.gpg >/dev/null 2>&1 26 | #+END_SRC 27 | 28 | Then add the official Apache CouchDB APT repository to your system’s sources list: 29 | 30 | #+BEGIN_SRC sh 31 | source /etc/os-release && echo "deb [signed-by=/usr/share/keyrings/couchdb-archive-keyring.gpg] https://apache.jfrog.io/artifactory/couchdb-deb/ ${VERSION_CODENAME} main" | sudo tee /etc/apt/sources.list.d/couchdb.list >/dev/null 32 | #+END_SRC 33 | 34 | And finally update the repository cache and install the package: 35 | 36 | #+BEGIN_SRC sh 37 | sudo apt update && sudo apt install couchdb 38 | #+END_SRC 39 | 40 | Alternatively, run CouchDB with [[https://www.docker.com/][docker]]: 41 | 42 | #+BEGIN_SRC sh 43 | docker run --detach --rm --name couchdb \ 44 | -v $(pwd)/.couchdb:/opt/couchdb/data \ 45 | -p 5984:5984 \ 46 | couchdb:2 47 | #+END_SRC 48 | 49 | See [[https://hub.docker.com/_/couchdb/][CouchDB Docker image docs]] for more details. 
50 | 51 | Note that starting with CouchDB 3.0 an admin user must be set 52 | up. See [[https://docs.couchdb.org/en/stable/intro/security.html#creating-a-new-admin-user][CouchDB documentation]]. 53 | 54 | With docker: 55 | 56 | #+BEGIN_SRC sh 57 | docker run --detach --rm --name couchdb \ 58 | -e COUCHDB_USER=admin \ 59 | -e COUCHDB_PASSWORD=secret \ 60 | -v $(pwd)/.couchdb:/opt/couchdb/data \ 61 | -p 5984:5984 \ 62 | couchdb:3 63 | #+END_SRC 64 | 65 | By default CouchDB uses /snappy/ for file compression. Change 66 | ~file_compression~ configuration parameter in ~couchdb~ config section to 67 | /deflate_6/ (Maximum is /deflate_9/). This reduces database disc space usage 68 | significantly. 69 | 70 | Create a new Python virtual environment: 71 | 72 | #+BEGIN_SRC sh 73 | python3 -m venv env-mwscrape 74 | #+END_SRC 75 | 76 | Activate it: 77 | 78 | #+BEGIN_SRC sh 79 | source env-mwscrape/bin/activate 80 | #+END_SRC 81 | 82 | Install /mwscrape/ from source: 83 | #+BEGIN_SRC sh 84 | pip install https://github.com/itkach/mwscrape/tarball/master 85 | #+END_SRC 86 | 87 | 88 | ** Usage 89 | 90 | #+BEGIN_SRC sh 91 | 92 | usage: mwscrape [-h] [--site-path SITE_PATH] [--site-ext SITE_EXT] [-c COUCH] 93 | [--db DB] [--titles TITLES [TITLES ...]] [--start START] 94 | [--changes-since CHANGES_SINCE] [--recent-days RECENT_DAYS] 95 | [--recent] [--timeout TIMEOUT] [-S] [-r [SESSION ID]] 96 | [--sessions-db-name SESSIONS_DB_NAME] [--desc] 97 | [--delete-not-found] [--speed {0,1,2,3,4,5}] 98 | [site] 99 | 100 | positional arguments: 101 | site MediaWiki site to scrape (host name), e.g. 102 | en.wikipedia.org 103 | 104 | optional arguments: 105 | -h, --help show this help message and exit 106 | --site-path SITE_PATH 107 | MediaWiki site API path. Default: /w/ 108 | --site-ext SITE_EXT MediaWiki site API script extension. Default: .php 109 | -c COUCH, --couch COUCH 110 | CouchDB server URL. Default: http://localhost:5984 111 | --db DB CouchDB database name. 
If not specified, the name will 112 | be derived from Mediawiki host name. 113 | --titles TITLES [TITLES ...] 114 | Download article pages with these names (titles). If 115 | name starts with @ it is interpreted as name of file 116 | containing titles, one per line, utf8 encoded. 117 | --start START Download all article pages beginning with this name 118 | --changes-since CHANGES_SINCE 119 | Download all article pages that changed since specified 120 | time. Timestamp format is yyyymmddhhmmss. See 121 | https://www.mediawiki.org/wiki/Timestamp. Hours, 122 | minutes and seconds can be omitted 123 | --recent-days RECENT_DAYS 124 | Number of days to look back for recent changes 125 | --recent Download recently changed articles only 126 | --timeout TIMEOUT Network communications timeout. Default: 30.0s 127 | -S, --siteinfo-only Fetch or update siteinfo, then exit 128 | -r [SESSION ID], --resume [SESSION ID] 129 | Resume previous scrape session. This relies on stats 130 | saved in mwscrape database. 131 | --sessions-db-name SESSIONS_DB_NAME 132 | Name of database where session info is stored. 133 | Default: mwscrape 134 | --desc Request all pages in descending order 135 | --delete-not-found Remove non-existing pages from the database 136 | --speed {0,1,2,3,4,5} 137 | Scrape speed 138 | --delay 139 | Pause before requesting rendered article for 140 | this many seconds. 141 | Some sites limit request rate so that even 142 | single-threaded, request-at-a-time scrapes 143 | are too fast and additional delay needs 144 | to be introduced 145 | --namespace ID of MediaWiki namespace to scrape. 146 | --user-agent HTTP user agent string. 147 | 148 | #+END_SRC 149 | 150 | The following examples are for CouchDB < 3.0 running in admin party mode. 
151 | 152 | To get English Wiktionary: 153 | 154 | #+BEGIN_SRC sh 155 | mwscrape en.wiktionary.org 156 | #+END_SRC 157 | 158 | To get the same but work through list of titles in reverse order: 159 | 160 | #+BEGIN_SRC sh 161 | mwscrape en.wiktionary.org --desc 162 | #+END_SRC 163 | 164 | Some sites expose Mediawiki API at path different from Wikipedia's 165 | default, specify it with ~--site-path~: 166 | 167 | #+BEGIN_SRC sh 168 | mwscrape lurkmore.to --site-path=/ 169 | #+END_SRC 170 | 171 | For CouchDB with admin user ~admin~ and password ~secret~ specify 172 | credentials as part of CouchDB URL: 173 | 174 | #+BEGIN_SRC sh 175 | mwscrape -c http://admin:secret@localhost:5984 en.wiktionary.org 176 | #+END_SRC 177 | 178 | /mwscrape/ compares page revisions reported by MediaWiki API with 179 | revisions of previously scraped pages in CouchDB and requests parsed 180 | page data if new revision is available. 181 | 182 | /mwscrape/ also creates a CouchDB design document ~w~ with show 183 | function ~html~ to allow viewing article html returned by MediaWiki 184 | API and navigating to html of other collected articles. 185 | For example, to view rendered html for article /A/ in 186 | database /simple-wikipedia-org/, in a web browser go to the 187 | following address (assuming CouchDB is running on localhost): 188 | 189 | http://127.0.0.1:5984/simple-wikipedia-org/_design/w/_show/html/A 190 | 191 | If databases are combined via replication articles with the same 192 | title will be stored as [[https://wiki.apache.org/couchdb/Replication_and_conflicts][conflicts]]. /mwresolvec/ script is 193 | provided to merge conflicting versions (combine aliases, select 194 | highest MediaWiki article revision, discard other 195 | revisions). 
196 | Usage: 197 | 198 | #+BEGIN_SRC sh 199 | mwresolvec [-h] [-s START] [-b BATCH_SIZE] [-w WORKERS] [-v] couch_url 200 | 201 | positional arguments: 202 | couch_url 203 | 204 | optional arguments: 205 | -h, --help show this help message and exit 206 | -s START, --start START 207 | -b BATCH_SIZE, --batch-size BATCH_SIZE 208 | -w WORKERS, --workers WORKERS 209 | -v, --verbose 210 | 211 | #+END_SRC 212 | 213 | Example: 214 | 215 | #+BEGIN_SRC sh 216 | mwresolvec http://localhost:5984/en-m-wikipedia-org 217 | #+END_SRC 218 | -------------------------------------------------------------------------------- /mwscrape/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/itkach/mwscrape/0025fe1a9d389919552581a1f349f3c7f1bb513d/mwscrape/__init__.py -------------------------------------------------------------------------------- /mwscrape/resolveconflicts.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2014 Igor Tkach 2 | # 3 | # This Source Code Form is subject to the terms of the Mozilla Public 4 | # License, v. 2.0. If a copy of the MPL was not distributed with this 5 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 
import argparse
import couchdb
import time
import traceback

from datetime import timedelta
from urllib.parse import urlparse
from concurrent import futures


def parse_args():
    """Parse the command line of the conflict resolution script.

    Returns the ``argparse`` namespace with ``couch_url``, ``start``,
    ``batch_size``, ``workers`` and ``verbose`` attributes.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("couch_url")
    argparser.add_argument("-s", "--start")
    argparser.add_argument("-b", "--batch-size", type=int, default=500)
    argparser.add_argument("-w", "--workers", type=int, default=50)
    argparser.add_argument("-v", "--verbose", action="store_true")
    return argparser.parse_args()


def mkclient(couch_url):
    """Open the CouchDB database addressed by *couch_url*.

    The URL path (without leading slashes) is used as the database name,
    scheme and network location as the server address. Credentials are
    taken from the URL and only applied when a password is present.
    """
    parsed_url = urlparse(couch_url)
    couch_db = parsed_url.path.lstrip("/")
    server_url = parsed_url.scheme + "://" + parsed_url.netloc
    server = couchdb.Server(server_url)
    username = parsed_url.username
    password = parsed_url.password
    print(
        "User %s%s at %s, database %s"
        % (
            username,
            "" if password else " (no password)",
            server.resource.url,
            couch_db,
        )
    )
    if password:
        server.resource.credentials = (username, password)
    return server[couch_db]


def _alias_set(doc):
    """Return the aliases stored in *doc* as a set of hashable values.

    Aliases carrying a fragment come back from JSON as lists; they are
    converted to tuples so that they can be members of a set.
    """
    aliases = set()
    for alias in doc.get("aliases", ()):
        if isinstance(alias, list):
            alias = tuple(alias)
        aliases.add(alias)
    return aliases


def resolve(db, doc_id, verbose=False):
    """Merge conflicting revisions of document *doc_id* in *db*.

    The revision with the highest MediaWiki ``parse.revid`` is kept and
    receives the union of aliases of all revisions; every other revision
    is deleted.

    Returns ``True`` if the document had conflicts, ``False`` otherwise
    (including when the document no longer exists).
    """
    doc = db.get(doc_id, conflicts=True)
    messages = []
    if doc is None:
        # document was removed after it had been listed
        if verbose:
            print("[not found] %s" % doc_id)
        return False
    conflicts = doc.get("_conflicts")
    if conflicts:
        best_mw_revid = doc["parse"]["revid"]
        docs = [doc]
        best_doc = doc
        all_aliases = _alias_set(doc)
        alias_count = len(all_aliases)
        article_revisions = {best_mw_revid}
        for conflict_rev in conflicts:
            conflict_doc = db.get(doc_id, rev=conflict_rev)
            if conflict_doc is None:
                # conflicting revision is already gone, nothing to merge
                continue
            docs.append(conflict_doc)
            conflict_mw_revid = conflict_doc["parse"]["revid"]
            article_revisions.add(conflict_mw_revid)
            if conflict_mw_revid > best_mw_revid:
                best_mw_revid = conflict_mw_revid
                best_doc = conflict_doc
            # collect aliases of the conflicting revision itself,
            # not of the winning document fetched above
            all_aliases.update(_alias_set(conflict_doc))
        new_aliases_count = len(all_aliases) - alias_count
        article_rev_count = len(article_revisions) - 1
        if verbose:
            messages.append("------")
            messages.append(
                "%s [%d conflict(s): +%dr, +%da]"
                % (doc_id, len(conflicts), article_rev_count, new_aliases_count)
            )
        for candidate in docs:
            if candidate.rev == best_doc.rev:
                if verbose:
                    messages.append("Keeping %s" % candidate.rev)
                candidate["aliases"] = list(all_aliases)
                db.save(candidate)
            else:
                if verbose:
                    messages.append("Discarding %s" % candidate.rev)
                db.delete(candidate)
        result = True
    else:
        if verbose:
            messages.append("[no conflicts] %s" % doc_id)
        result = False
    if messages:
        print("\n".join(messages))
    return result


def _resolve_reporting_errors(db, doc_id, verbose):
    """Run resolve() and report a failure instead of losing it in the pool."""
    try:
        return resolve(db, doc_id, verbose=verbose)
    except Exception:
        print("Failed to resolve conflicts for %s:" % doc_id)
        traceback.print_exc()
        return False


def main():
    """Resolve conflicts for every document of the database given on the
    command line, using a pool of worker threads."""
    args = parse_args()
    db = mkclient(args.couch_url)
    viewoptions = {}
    if args.start:
        viewoptions["startkey"] = args.start
        viewoptions["startkey_docid"] = args.start

    t0 = time.time()
    with futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        for row in db.iterview("_all_docs", args.batch_size, **viewoptions):
            # results are not collected, so failures are reported by the task
            executor.submit(_resolve_reporting_errors, db, row.id, args.verbose)
    print("Done in %s" % timedelta(seconds=int(time.time() - t0)))


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /mwscrape/scrape.py:
# --------------------------------------------------------------------------------
# Copyright (C) 2013-2014 Igor Tkach
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import argparse
import errno
import fcntl
import hashlib
import os
import random
import socket
import tempfile
import time
import traceback

import urllib.parse

from urllib.parse import urlparse
from urllib.parse import urlunparse
from collections import namedtuple
from datetime import datetime, timedelta
from multiprocessing import RLock
from multiprocessing.pool import ThreadPool
from contextlib import contextmanager

import couchdb
import mwclient
import mwclient.page
import mwclient.client
import pylru

import _thread


def fix_server_url(general_siteinfo):
    """
    Get server url from siteinfo's 'general' dict,
    add http if scheme is missing. This will also modify
    given dictionary.

    >>> general_siteinfo = {'server': '//simple.wikipedia.org'}
    >>> fix_server_url(general_siteinfo)
    'http://simple.wikipedia.org'
    >>> general_siteinfo
    {'server': 'http://simple.wikipedia.org'}

    >>> fix_server_url({'server': 'https://en.wikipedia.org'})
    'https://en.wikipedia.org'

    >>> fix_server_url({})
    ''

    """
    server = general_siteinfo.get("server", "")
    if server:
        p = urlparse(server)
        if not p.scheme:
            server = urlunparse(
                urllib.parse.ParseResult(
                    "http", p.netloc, p.path, p.params, p.query, p.fragment
                )
            )
            general_siteinfo["server"] = server
    return server


def update_siteinfo(site, couch_server, db_name):
    """Fetch siteinfo from *site* and store it in the ``siteinfo`` database.

    The document id is *db_name*; an existing document is updated in
    place, otherwise a new one is created.
    """
    try:
        siteinfo_db = couch_server.create("siteinfo")
    except couchdb.PreconditionFailed:
        # database already exists
        siteinfo_db = couch_server["siteinfo"]

    siteinfo = site.api(
        "query",
        meta="siteinfo",
        siprop="general|interwikimap|rightsinfo|statistics|namespaces",
    )["query"]

    fix_server_url(siteinfo["general"])

    siteinfo.pop("userinfo", None)

    siteinfo_doc = siteinfo_db.get(db_name)

    if siteinfo_doc:
        siteinfo_doc.update(siteinfo)
    else:
        siteinfo_doc = siteinfo

    siteinfo_db[db_name] = siteinfo_doc


def parse_args():
    """Define and parse the command line options of the scraper."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "site",
        nargs="?",
        help=("MediaWiki site to scrape (host name), " "e.g. en.wikipedia.org"),
    )
    argparser.add_argument(
        "--site-path",
        default="/w/",
        help=("MediaWiki site API path. " "Default: %(default)s"),
    )
    argparser.add_argument(
        "--site-ext",
        default=".php",
        help=("MediaWiki site API script extension. " "Default: %(default)s"),
    )
    argparser.add_argument(
        "-c",
        "--couch",
        help=("CouchDB server URL. " "Default: %(default)s"),
        default="http://localhost:5984",
    )
    argparser.add_argument(
        "--db",
        help=(
            "CouchDB database name. "
            "If not specified, the name will be "
            "derived from Mediawiki host name."
        ),
        default=None,
    )
    argparser.add_argument(
        "--titles",
        nargs="+",
        help=(
            "Download article pages with "
            "these names (titles). "
            "If name starts with @ it is "
            "interpreted as name of file containing titles, "
            "one per line, utf8 encoded."
        ),
    )
    argparser.add_argument(
        "--start", help=("Download all article pages " "beginning with this name")
    )
    argparser.add_argument(
        "--changes-since",
        help=(
            "Download all article pages "
            "that change since specified time. "
            "Timestamp format is yyyymmddhhmmss. "
            "See https://www.mediawiki.org/wiki/Timestamp. "
            "Hours, minutes and seconds can be omitted"
        ),
    )
    argparser.add_argument(
        "--recent-days",
        type=int,
        default=1,
        help=("Number of days to look back for recent changes"),
    )
    argparser.add_argument(
        "--recent",
        action="store_true",
        help=("Download recently changed articles only"),
    )
    argparser.add_argument(
        "--timeout",
        default=30.0,
        type=float,
        help=("Network communications timeout. " "Default: %(default)ss"),
    )
    argparser.add_argument(
        "-S",
        "--siteinfo-only",
        action="store_true",
        help=("Fetch or update siteinfo, then exit"),
    )
    argparser.add_argument(
        "-r",
        "--resume",
        nargs="?",
        default="",
        metavar="SESSION ID",
        help=(
            "Resume previous scrape session. "
            "This relies on stats saved in "
            "mwscrape database."
        ),
    )
    argparser.add_argument(
        "--sessions-db-name",
        default="mwscrape",
        help=(
            "Name of database where " "session info is stored. " "Default: %(default)s"
        ),
    )
    argparser.add_argument(
        "--desc", action="store_true", help=("Request all pages in descending order")
    )

    argparser.add_argument(
        "--delete-not-found",
        action="store_true",
        help=("Remove non-existing pages from the database"),
    )

    argparser.add_argument(
        "--speed", type=int, choices=range(0, 6), default=0, help=("Scrape speed")
    )

    argparser.add_argument(
        "--delay",
        type=float,
        default=0,
        help=(
            "Pause before requesting rendered article "
            "for this many seconds. Default: %(default)s. "
            "Some sites limit request rate so that even "
            "single-threaded, request-at-a-time scrapes are too fast "
            "and additional delay needs to be introduced"
        ),
    )

    argparser.add_argument(
        "--namespace",
        type=int,
        default=0,
        help=("ID of MediaWiki namespace to " "scrape. Default: %(default)s"),
    )

    argparser.add_argument(
        "--user-agent",
        type=str,
        default=None,
        help=("HTTP user agent string. Default: %s" % mwclient.client.USER_AGENT),
    )

    return argparser.parse_args()


# CouchDB show function: returns article html with /wiki/ links rewritten
# to relative links (underscores replaced with spaces).
SHOW_FUNC = r"""
function(doc, req)
{
  var r = /href="\/wiki\/(.*?)"/gi;
  var replace = function(match, p1, offset, string) {
    return 'href="' + p1.replace(/_/g, ' ') + '"';
  };
  return doc.parse.text['*'].replace(r, replace);
}
"""


def set_show_func(db, show_func=SHOW_FUNC, force=False):
    """Install *show_func* as show function ``html`` of design document ``w``.

    An existing ``html`` show function is only replaced when *force* is true.
    """
    design_doc = db.get("_design/w", {})
    shows = design_doc.get("shows", {})
    if force or not shows.get("html"):
        shows["html"] = show_func
        design_doc["shows"] = shows
        db["_design/w"] = design_doc


Redirect = namedtuple("Redirect", "page fragment")


def redirects_to(site, from_title):
    """Same as mwclient.page.Page.redirects_to except it returns page and fragment
    in a named tuple instead of just target page.

    Returns ``None`` when the API reports no redirect for *from_title*.
    """
    info = site.api("query", prop="pageprops", titles=from_title, redirects="")["query"]
    if "redirects" in info:
        for page in info["redirects"]:
            if page["from"] == from_title:
                return Redirect(
                    page=mwclient.page.Page(site, page["to"]),
                    fragment=page.get("tofragment", ""),
                )
        return None
    return None


def scheme_and_host(site_host):
    """Split *site_host* into ``(scheme, host)``; scheme defaults to https
    when *site_host* is a bare host name."""
    p = urlparse(site_host)
    scheme = p.scheme if p.scheme else "https"
    host = p.netloc if p.scheme else site_host
    return scheme, host


def mkcouch(url):
    """Create a ``couchdb.Server`` for *url*, applying credentials embedded
    in the URL when a password is present."""
    parsed = urlparse(url)
    server_url = parsed.scheme + "://" + parsed.netloc
    server = couchdb.Server(server_url)
    user = parsed.username
    password = parsed.password
    if password:
        print("Connecting %s as user %s" % (server.resource.url, user))
        server.resource.credentials = (user, password)
    return server


@contextmanager
def flock(path):
    """Hold an exclusive, non-blocking lock on file *path* for the duration
    of the ``with`` block.

    Exits the process with status 1 if the lock is already held elsewhere.
    Any other error while acquiring the lock is re-raised, and errors
    raised by the ``with`` body are not intercepted.
    """
    with open(path, "w") as lock_fd:
        try:
            fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except OSError as ex:
            # EAGAIN/EACCES signal that another process holds the lock
            if ex.errno in (errno.EAGAIN, errno.EACCES):
                print(
                    "Scrape for this host is already in progress. "
                    "Use --speed option instead of starting multiple processes."
                )
                raise SystemExit(1)
            raise
        # the lock file is closed by the enclosing ``with`` on exit
        yield


def fmt_mw_tms(dt):
    """Format datetime *dt* as a MediaWiki timestamp (yyyymmddhhmmss)."""
    return datetime.strftime(dt, "%Y%m%d%H%M%S")


def main():
    """Run a scrape session: start or resume it, pick pages to visit and
    store their parsed content in CouchDB."""
    args = parse_args()

    socket.setdefaulttimeout(args.timeout)

    couch_server = mkcouch(args.couch)

    sessions_db_name = args.sessions_db_name
    try:
        sessions_db = couch_server.create(sessions_db_name)
    except couchdb.PreconditionFailed:
        sessions_db = couch_server[sessions_db_name]

    # --resume without a value yields None, with a value the session id,
    # and "" when the option is absent
    if args.resume or args.resume is None:
        session_id = args.resume
        if session_id is None:
            current_doc = sessions_db["$current"]
            session_id = current_doc["session_id"]
        print("Resuming session %s" % session_id)
        session_doc = sessions_db[session_id]
        site_host = session_doc["site"]
        scheme, host = scheme_and_host(site_host)
        db_name = session_doc["db_name"]
        session_doc["resumed_at"] = datetime.utcnow().isoformat()
        if args.start:
            start_page_name = args.start
        else:
            start_page_name = session_doc.get("last_page_name", args.start)
        if args.desc:
            descending = True
        else:
            descending = session_doc.get("descending", False)
        sessions_db[session_id] = session_doc
    else:
        site_host = args.site
        db_name = args.db
        start_page_name = args.start
        descending = args.desc
        if not site_host:
            print("Site to scrape is not specified")
            raise SystemExit(1)
        scheme, host = scheme_and_host(site_host)
        if not db_name:
            db_name = host.replace(".", "-")
        session_id = "-".join(
            (db_name, str(int(time.time())), str(int(1000 * random.random())))
        )
        print("Starting session %s" % session_id)
        sessions_db[session_id] = {
            "created_at": datetime.utcnow().isoformat(),
            "site": site_host,
            "db_name": db_name,
            "descending": descending,
        }
        current_doc = sessions_db.get("$current", {})
        current_doc["session_id"] = session_id
        sessions_db["$current"] = current_doc

    headers = {}
    if args.user_agent:
        headers = {"User-Agent": args.user_agent}
    site = mwclient.Site(
        host,
        path=args.site_path,
        ext=args.site_ext,
        scheme=scheme,
        custom_headers=headers,
    )

    update_siteinfo(site, couch_server, db_name)

    if args.siteinfo_only:
        return

    try:
        db = couch_server.create(db_name)
    except couchdb.PreconditionFailed:
        db = couch_server[db_name]

    set_show_func(db)

    def titles_from_args(titles):
        """Yield titles; ``@name`` arguments are read from file ``name``."""
        for title in titles:
            if title.startswith("@"):
                # title files are documented as utf8 encoded
                with open(os.path.expanduser(title[1:]), encoding="utf-8") as f:
                    for line in f:
                        name = line.strip()
                        if name:
                            yield name
            else:
                yield title

    def recently_changed_pages(timestamp):
        """Yield titles changed since *timestamp* whose stored revision
        differs from the one reported by the site."""
        changes = site.recentchanges(
            start=timestamp,
            namespace=args.namespace,
            toponly=1,
            type="edit|new",
            dir="newer",
            show="!minor|!redirect|!anon|!bot",
        )
        for page in changes:
            title = page.get("title")
            if title:
                doc = db.get(title)
                doc_revid = doc.get("parse", {}).get("revid") if doc else None
                revid = page.get("revid")
                if doc_revid == revid:
                    continue
                yield title

    page_list = mwclient.listing.PageList(site, namespace=args.namespace)

    if args.titles:
        pages = (page_list[title] for title in titles_from_args(args.titles))
    elif args.changes_since or args.recent:
        if args.recent:
            recent_days = args.recent_days
            changes_since = fmt_mw_tms(datetime.utcnow() + timedelta(days=-recent_days))
        else:
            # pad omitted hours, minutes and seconds with zeros
            changes_since = args.changes_since.ljust(14, "0")
        print("Getting recent changes (since %s)" % changes_since)
        pages = (page_list[title] for title in recently_changed_pages(changes_since))

    else:
        print("Starting at %s" % start_page_name)
        pages = site.allpages(
            start=start_page_name,
            namespace=args.namespace,
            dir="descending" if descending else "ascending",
        )

    # threads are updating the same session document,
    # we don't want to have conflicts
    lock = RLock()

    def inc_count(count_name):
        """Increment counter *count_name* in the session document."""
        with lock:
            session_doc = sessions_db[session_id]
            count = session_doc.get(count_name, 0)
            session_doc[count_name] = count + 1
            sessions_db[session_id] = session_doc

    def update_session(title):
        """Record *title* as the last page seen by this session."""
        with lock:
            session_doc = sessions_db[session_id]
            session_doc["last_page_name"] = title
            session_doc["updated_at"] = datetime.utcnow().isoformat()
            sessions_db[session_id] = session_doc

    def process(page):
        """Follow redirects of *page* and store its parsed content if the
        stored revision is missing or out of date."""
        title = page.name
        if not page.exists:
            print("Not found: %s" % title)
            inc_count("not_found")
            if args.delete_not_found:
                try:
                    del db[title]
                except couchdb.ResourceNotFound:
                    print("%s was not in the database" % title)
                except couchdb.ResourceConflict:
                    print("Conflict while deleting %s" % title)
                except Exception:
                    traceback.print_exc()
                else:
                    print("%s removed from the database" % title)
            return
        try:
            aliases = set()
            redirect_count = 0
            while page.redirect:
                redirect_count += 1
                redirect_target = redirects_to(site, page.name)
                if redirect_target is None:
                    # target can't be determined, reported below as failed
                    break
                frag = redirect_target.fragment
                if frag:
                    alias = (title, frag)
                else:
                    alias = title
                aliases.add(alias)

                page = redirect_target.page
                print("%s ==> %s" % (title, page.name + (("#" + frag) if frag else "")))

                if redirect_count >= 10:
                    print("Too many redirect levels: %r" % aliases)
                    break

            title = page.name

            if page.redirect:
                print("Failed to resolve redirect %s" % title)
                inc_count("failed_redirect")
                return

            doc = db.get(title)
            if doc:
                current_aliases = set()
                for alias in doc.get("aliases", ()):
                    # aliases with fragment are stored as JSON lists
                    if isinstance(alias, list):
                        alias = tuple(alias)
                    current_aliases.add(alias)
                if not aliases.issubset(current_aliases):
                    merged_aliases = aliases | current_aliases
                    # remove aliases without fragment if one with fragment is present
                    # this is mostly to cleanup aliases in old scrapes
                    to_remove = set()
                    for alias in merged_aliases:
                        if isinstance(alias, tuple):
                            to_remove.add(alias[0])
                    merged_aliases = merged_aliases - to_remove
                    doc["aliases"] = list(merged_aliases)
                    db[title] = doc
                revid = doc.get("parse", {}).get("revid")
                if page.revision == revid:
                    print("%s is up to date (rev. %s), skipping" % (title, revid))
                    inc_count("up_to_date")
                    return
                inc_count("updated")
                print(
                    "[%s] rev. %s => %s %s"
                    % (
                        time.strftime("%x %X", (page.touched)) if page.touched else "?",
                        revid,
                        page.revision,
                        title,
                    )
                )
            if args.delay:
                time.sleep(args.delay)
            parse = site.api("parse", page=title)
        except KeyboardInterrupt as kbd:
            print("Caught KeyboardInterrupt", kbd)
            _thread.interrupt_main()
            # doc and parse may be unbound at this point, stop here
            return
        except couchdb.ResourceConflict:
            print("Update conflict, skipping: %s" % title)
            return
        except Exception:
            print("Failed to process %s:" % title)
            traceback.print_exc()
            inc_count("error")
            return
        if doc:
            doc.update(parse)
        else:
            inc_count("new")
            doc = parse
            if aliases:
                doc["aliases"] = list(aliases)
        try:
            db[title] = doc
        except couchdb.ResourceConflict:
            print("Update conflict, skipping: %s" % title)
            return
        except Exception:
            print("Error handling title %r" % title)
            traceback.print_exc()

    seen = pylru.lrucache(10000)

    def ipages(pages):
        """Yield pages not seen recently, recording progress in the session."""
        for index, page in enumerate(pages):
            title = page.name
            print("%7s %s" % (index, title))
            if title in seen:
                print("Already saw %s, skipping" % (title,))
                continue
            seen[title] = True
            update_session(title)
            yield page

    # one lock file per host prevents concurrent scrapes of the same site
    with flock(
        os.path.join(
            tempfile.gettempdir(), hashlib.sha1(host.encode("utf-8")).hexdigest()
        )
    ):
        if args.speed and not args.delay:
            pool = ThreadPool(processes=args.speed * 2)
            for _result in pool.imap(process, ipages(pages)):
                pass

        else:
            for page in ipages(pages):
                process(page)


if __name__ == "__main__":
    main()

# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
# 1 | from distutils.core import setup
# 2 |
# 3 | setup(name='mwscrape',
# 4
| version='1.0', 5 | description='Download', 6 | author='Igor Tkach', 7 | author_email='itkach@gmail.com', 8 | url='http://github.com/itkach/mwscrape', 9 | license='MPL 2.0', 10 | packages=['mwscrape'], 11 | #mwclient appears to need six, but doesn't declare it as dependency 12 | install_requires=['CouchDB >= 0.10', 'mwclient >= 0.10.0', 'pylru'], 13 | entry_points={'console_scripts': [ 14 | 'mwscrape=mwscrape.scrape:main', 15 | 'mwresolvec=mwscrape.resolveconflicts:main', 16 | ]}) 17 | --------------------------------------------------------------------------------