├── tests ├── __init__.py ├── extractors │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_extractor.py │ │ │ ├── test_util.py │ │ │ └── test_datasources.py │ └── test_extractor.py ├── features │ ├── __init__.py │ ├── bytes │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_revision_oriented.py │ ├── meta │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_vectorizers.py │ │ │ └── test_bools.py │ ├── temporal │ │ ├── __init__.py │ │ └── tests │ │ │ └── __init__.py │ ├── wikibase │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_util.py │ ├── wikitext │ │ ├── __init__.py │ │ └── tests │ │ │ └── __init__.py │ ├── test_modifiers.py │ └── test_functions.py ├── languages │ ├── __init__.py │ ├── features │ │ ├── __init__.py │ │ ├── dictionary │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ │ ├── __init__.py │ │ │ │ └── test_util.py │ │ ├── stemmed │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ │ └── __init__.py │ │ ├── stopwords │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ │ └── __init__.py │ │ └── matches │ │ │ ├── tests │ │ │ └── __init__.py │ │ │ └── test_substrings.py │ ├── util.py │ ├── test_basque.py │ ├── test_hebrew.py │ ├── test_japanese.py │ ├── test_korean.py │ ├── test_persian.py │ └── test_finnish.py ├── scoring │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_util.py │ │ │ ├── test_model.py │ │ │ ├── test_random_forest.py │ │ │ ├── test_gradient_boosting.py │ │ │ ├── test_naive_bayes.py │ │ │ └── test_svc.py │ ├── statistics │ │ ├── __init__.py │ │ ├── tests │ │ │ └── __init__.py │ │ └── classification │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_counts.py │ │ │ ├── test_threshold_optimization.py │ │ │ ├── test_multilabel_counts.py │ │ │ ├── test_micro_macro_stats.py │ │ │ └── test_rates.py │ ├── test_environment.py │ ├── test_model_info.py │ ├── test_util.py │ └── 
test_labels.py ├── utilities │ ├── __init__.py │ ├── data │ │ ├── labeled_foo.json │ │ └── labeled_revisions.json │ ├── test_fetch_idioms.py │ ├── test_union_intersect_observations.py │ ├── test_util.py │ └── test_union_merge_observations.py ├── datasources │ ├── __init__.py │ ├── meta │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_gramming.py │ │ │ ├── test_hashing.py │ │ │ ├── test_dicts.py │ │ │ ├── test_vectorizers.py │ │ │ ├── test_extractors.py │ │ │ ├── test_filters.py │ │ │ ├── test_mappers.py │ │ │ └── test_frequencies.py │ ├── util.py │ └── test_datasource.py ├── dependencies │ ├── __init__.py │ ├── test_context.py │ └── test_dependent.py └── test_score_processor.py ├── docs ├── changelog.rst ├── requirements.txt ├── revscoring.scoring.rst ├── revscoring.features.rst ├── revscoring.languages.rst ├── revscoring.utilities.rst ├── revscoring.datasources.rst ├── revscoring.extractors.rst ├── revscoring.dependencies.rst ├── revscoring.features.meta.rst ├── revscoring.features.bytes.rst ├── revscoring.scoring.models.rst ├── revscoring.datasources.meta.rst ├── revscoring.features.temporal.rst ├── revscoring.features.wikibase.rst ├── revscoring.features.wikitext.rst ├── revscoring.features.modifiers.rst ├── revscoring.languages.features.rst ├── revscoring.scoring.statistics.rst ├── revscoring.features.revision_oriented.rst ├── revscoring.datasources.revision_oriented.rst ├── index.rst ├── notes │ └── 2015-10-07.language_structure.txt ├── api_reference.rst └── notes_on_adhoc_jobs.txt ├── MANIFEST.in ├── utility ├── revscoring ├── features │ ├── util.py │ ├── wikibase │ │ ├── datasources │ │ │ └── __init__.py │ │ ├── features │ │ │ └── __init__.py │ │ ├── revision_oriented.py │ │ ├── __init__.py │ │ └── util.py │ ├── wikitext │ │ ├── datasources │ │ │ ├── __init__.py │ │ │ ├── revision_oriented.py │ │ │ └── sentences.py │ │ ├── features │ │ │ ├── __init__.py │ │ │ └── revision_oriented.py │ │ └── revision_oriented.py │ ├── bytes │ │ ├── 
__init__.py │ │ ├── datasources.py │ │ └── revision_oriented.py │ ├── meta │ │ ├── __init__.py │ │ └── vectorizers.py │ ├── temporal │ │ └── __init__.py │ ├── feature_vector.py │ ├── modifiers.py │ ├── functions.py │ └── __init__.py ├── extractors │ ├── api │ │ ├── __init__.py │ │ └── util.py │ └── __init__.py ├── about.py ├── dependencies │ ├── util.py │ └── __init__.py ├── languages │ ├── basque.py │ ├── features │ │ ├── stemmed │ │ │ ├── __init__.py │ │ │ ├── stemmed.py │ │ │ └── datasources.py │ │ ├── stopwords │ │ │ ├── __init__.py │ │ │ └── stopwords.py │ │ ├── dictionary │ │ │ ├── __init__.py │ │ │ ├── dictionary.py │ │ │ └── util.py │ │ ├── __init__.py │ │ └── matches │ │ │ ├── __init__.py │ │ │ ├── substring_matches.py │ │ │ ├── regex_matches.py │ │ │ └── matches.py │ ├── korean.py │ ├── japanese.py │ ├── finnish.py │ ├── vietnamese.py │ ├── romanian.py │ └── hebrew.py ├── scoring │ ├── models │ │ ├── gradient_boosting.py │ │ ├── random_forest.py │ │ ├── linear.py │ │ ├── svc.py │ │ ├── naive_bayes.py │ │ ├── util.py │ │ └── __init__.py │ ├── statistics │ │ ├── __init__.py │ │ ├── statistics.py │ │ └── classification │ │ │ ├── label_thresholds.py │ │ │ ├── __init__.py │ │ │ └── rates.py │ ├── __init__.py │ └── util.py ├── datasources │ ├── meta │ │ ├── timestamp.py │ │ ├── __init__.py │ │ ├── indexable.py │ │ ├── hashing.py │ │ ├── gramming.py │ │ └── dicts.py │ ├── datasource.py │ └── __init__.py ├── utilities │ ├── check_model.py │ ├── model_info.py │ ├── __init__.py │ ├── intersect_merge_observations.py │ ├── union_merge_observations.py │ ├── test_model.py │ ├── fetch_idioms.py │ └── fit.py └── revscoring.py ├── .codecov.yml ├── scripts └── deploy.sh ├── CODE_OF_CONDUCT.md ├── setup.cfg ├── test-requirements.txt ├── config ├── logistic_regression.params.yaml ├── random_forest.params.yaml ├── gradient_boost.params.yaml ├── naive_bayes.params.yaml └── svc.params.yaml ├── pytest.ini ├── tox.ini ├── examples ├── scoring.py ├── language_support.py └── 
extraction.py ├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ └── publish_python_package.yml ├── requirements.txt ├── Dockerfile ├── .gitignore ├── Makefile ├── LICENSE ├── setup.py └── .travis.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/dependencies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/features/bytes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/meta/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasources/meta/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/api/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/bytes/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/meta/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/temporal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikibase/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikitext/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/languages/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/temporal/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikibase/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikitext/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/languages/features/stemmed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/stopwords/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../CHANGELOG.md 2 | -------------------------------------------------------------------------------- /tests/languages/features/dictionary/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/matches/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/stemmed/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/stopwords/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==2.4.4 2 | sphinx-py3doc-enhanced-theme 3 | m2r 4 | -e . 5 | -------------------------------------------------------------------------------- /tests/utilities/data/labeled_foo.json: -------------------------------------------------------------------------------- 1 | {"foo": 1, "goodfaith": 0, "rev_id": "16124390"} 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md requirements.txt 2 | include revscoring/assets/*.txt 3 | -------------------------------------------------------------------------------- /utility: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from revscoring import revscoring 3 | 4 | revscoring.main() 5 | -------------------------------------------------------------------------------- /revscoring/features/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | SECTION_COMMENT_RE = re.compile(r"\/\*([^\*]|\*[^\/])+\*\/") 4 | -------------------------------------------------------------------------------- /docs/revscoring.scoring.rst: -------------------------------------------------------------------------------- 1 | revscoring.scoring 2 | ================== 3 | 4 | .. 
automodule:: revscoring.scoring 5 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: no 4 | 5 | coverage: 6 | status: 7 | patch: off 8 | -------------------------------------------------------------------------------- /docs/revscoring.features.rst: -------------------------------------------------------------------------------- 1 | revscoring.features 2 | =================== 3 | 4 | .. automodule:: revscoring.features 5 | -------------------------------------------------------------------------------- /docs/revscoring.languages.rst: -------------------------------------------------------------------------------- 1 | revscoring.languages 2 | ==================== 3 | 4 | .. automodule:: revscoring.languages 5 | -------------------------------------------------------------------------------- /docs/revscoring.utilities.rst: -------------------------------------------------------------------------------- 1 | revscoring.utilities 2 | ==================== 3 | 4 | .. 
automodule:: revscoring.utilities 5 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | from .revision_oriented import Revision 2 | 3 | __all__ = [Revision] 4 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | from .revision_oriented import Revision 2 | 3 | __all__ = [Revision] 4 | -------------------------------------------------------------------------------- /docs/revscoring.datasources.rst: -------------------------------------------------------------------------------- 1 | revscoring.datasources 2 | ====================== 3 | 4 | .. automodule:: revscoring.datasources 5 | -------------------------------------------------------------------------------- /docs/revscoring.extractors.rst: -------------------------------------------------------------------------------- 1 | revscoring.extractors 2 | ===================== 3 | 4 | .. automodule:: revscoring.extractors 5 | -------------------------------------------------------------------------------- /revscoring/extractors/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractor import Extractor, MWAPICache 2 | 3 | __all__ = [Extractor, MWAPICache] 4 | -------------------------------------------------------------------------------- /docs/revscoring.dependencies.rst: -------------------------------------------------------------------------------- 1 | revscoring.dependencies 2 | ======================= 3 | 4 | .. 
automodule:: revscoring.dependencies 5 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .revision_oriented import Diff, Revision 2 | 3 | __all__ = [Revision, Diff] 4 | -------------------------------------------------------------------------------- /scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | python setup.py sdist bdist_wheel && twine upload dist/* --skip-existing --username $PYPI_USER --password $PYPI_PASS 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | The development of this software is covered by a [Code of Conduct](https://www.mediawiki.org/wiki/Code_of_Conduct). 2 | 3 | -------------------------------------------------------------------------------- /docs/revscoring.features.meta.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.meta 2 | ======================== 3 | 4 | .. automodule:: revscoring.features.meta 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [build_sphinx] 5 | source-dir = doc/ 6 | build-dir = doc/_build 7 | all_files = 1 8 | 9 | -------------------------------------------------------------------------------- /docs/revscoring.features.bytes.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.bytes 2 | ========================= 3 | 4 | .. 
automodule:: revscoring.features.bytes 5 | -------------------------------------------------------------------------------- /docs/revscoring.scoring.models.rst: -------------------------------------------------------------------------------- 1 | revscoring.scoring.models 2 | ========================= 3 | 4 | .. automodule:: revscoring.scoring.models 5 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest >= 4.6.0, < 4.6.999 2 | codecov 3 | pytest-cov 4 | google-compute-engine <= 2.7.999 5 | flake8 >= 3.8.1, < 3.8.999 6 | -------------------------------------------------------------------------------- /docs/revscoring.datasources.meta.rst: -------------------------------------------------------------------------------- 1 | revscoring.datasources.meta 2 | =========================== 3 | 4 | .. automodule:: revscoring.datasources.meta 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.temporal.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.temporal 2 | ============================ 3 | 4 | .. automodule:: revscoring.features.temporal 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.wikibase.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.wikibase 2 | ============================ 3 | 4 | .. automodule:: revscoring.features.wikibase 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.wikitext.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.wikitext 2 | ============================ 3 | 4 | .. 
automodule:: revscoring.features.wikitext 5 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .diff import Diff 2 | from .revision_oriented import Revision 3 | 4 | __all__ = [Revision, Diff] 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.modifiers.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.modifiers 2 | ============================= 3 | 4 | .. automodule:: revscoring.features.modifiers 5 | -------------------------------------------------------------------------------- /docs/revscoring.languages.features.rst: -------------------------------------------------------------------------------- 1 | revscoring.languages.features 2 | ============================= 3 | 4 | .. automodule:: revscoring.languages.features 5 | -------------------------------------------------------------------------------- /docs/revscoring.scoring.statistics.rst: -------------------------------------------------------------------------------- 1 | revscoring.scoring.statistics 2 | ============================= 3 | 4 | .. 
automodule:: revscoring.scoring.statistics 5 | -------------------------------------------------------------------------------- /config/logistic_regression.params.yaml: -------------------------------------------------------------------------------- 1 | LogisticRegression: 2 | class: revscoring.scoring.models.LogisticRegression 3 | params: 4 | penalty: ["l1", "l2"] 5 | C: [0.1, 1, 10] 6 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | nottravis: marks tests as being known to fail in travis for unconcerning reasons 4 | addopts = --ignore=revscoring/utilities/test_model.py 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.revision_oriented.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.revision_oriented 2 | ===================================== 3 | 4 | .. automodule:: revscoring.features.revision_oriented 5 | -------------------------------------------------------------------------------- /docs/revscoring.datasources.revision_oriented.rst: -------------------------------------------------------------------------------- 1 | revscoring.datasources.revision_oriented 2 | ======================================== 3 | 4 | .. 
automodule:: revscoring.datasources.revision_oriented 5 | -------------------------------------------------------------------------------- /tests/datasources/util.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | 5 | 6 | def check_datasource(ds): 7 | assert isinstance(ds, Datasource) 8 | assert pickle.loads(pickle.dumps(ds)) == ds 9 | -------------------------------------------------------------------------------- /tests/languages/features/dictionary/tests/test_util.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.languages.features.dictionary.util import utf16_cleanup 3 | 4 | 5 | def test_utf16_cleanup(): 6 | assert (utf16_cleanup("Foobar" + chr(2 ** 16)) == 7 | "Foobar\uFFFD") 8 | -------------------------------------------------------------------------------- /config/random_forest.params.yaml: -------------------------------------------------------------------------------- 1 | RandomForest: 2 | class: revscoring.scoring.models.RandomForest 3 | params: 4 | n_estimators: [128, 256, 512, 1024] 5 | min_samples_leaf: [1, 3, 5, 7, 13] 6 | max_features: ["log2"] 7 | criterion: ["gini", "entropy"] 8 | -------------------------------------------------------------------------------- /config/gradient_boost.params.yaml: -------------------------------------------------------------------------------- 1 | GradientBoosting: 2 | class: revscoring.scoring.models.GradientBoosting 3 | params: 4 | n_estimators: [500, 700, 1200, 1500, 2000] 5 | max_depth: [5, 7, 9, 11] 6 | max_features: ["log2"] 7 | learning_rate: [0.001, 0.01, 0.1, 0.5] 8 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Revision Scoring 2 | ================ 3 | .. 
automodule:: revscoring 4 | 5 | Project Info: 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | changelog 11 | 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | -------------------------------------------------------------------------------- /config/naive_bayes.params.yaml: -------------------------------------------------------------------------------- 1 | GaussianNB: 2 | class: revscoring.scoring.models.GaussianNB 3 | params: {} 4 | BernoulliNB: 5 | class: revscoring.scoring.models.BernoulliNB 6 | params: {} 7 | MultinomialNB: 8 | class: revscoring.scoring.models.MultinomialNB 9 | params: 10 | alpha: [0.1, 1, 10] 11 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented 2 | 3 | from . import datasources 4 | from .features import Revision 5 | 6 | name = "wikitext.revision" 7 | 8 | revision = Revision( 9 | name, 10 | datasources.Revision(name, revision_oriented.revision) 11 | ) 12 | -------------------------------------------------------------------------------- /revscoring/about.py: -------------------------------------------------------------------------------- 1 | __name__ = "revscoring" 2 | __version__ = "2.11.13" 3 | __author__ = "Aaron Halfaker" 4 | __author_email__ = "ahalfaker@wikimedia.org" 5 | __description__ = ("A set of utilities for generating quality scores for" + 6 | " MediaWiki revisions") 7 | __url__ = "https://github.com/wikimedia/revscoring" 8 | __license__ = "MIT" 9 | -------------------------------------------------------------------------------- /config/svc.params.yaml: -------------------------------------------------------------------------------- 1 | SVC: 2 | class: sklearn.svm.SVC 3 | params: 4 | - 5 | kernel: ["rbf"] 6 | probability: [true] 7 | 
gamma: [0.0, 0.001, 0.0001] 8 | cache_size: [1000] 9 | C: [0.1, 1, 10] 10 | - 11 | kernel: ["linear"] 12 | probability: [true] 13 | cache_size: [1000] 14 | C: [0.1, 1, 10] 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | # This file is code-generated by Sphinx, so we don't care. 4 | doc/conf.py 5 | ignore = E113, 6 | E111, 7 | E126, 8 | E127, 9 | E131, 10 | E305, 11 | # E501 line too long (> 79 characters) 12 | E501 13 | W504 14 | E741 15 | 16 | [pytest] 17 | addopts = --ignore revscoring/utilities/test_model.py 18 | -------------------------------------------------------------------------------- /revscoring/features/bytes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This features module provides access to features of the bytes of content in 3 | revisions. 4 | 5 | .. autodata:: revscoring.features.bytes.revision 6 | 7 | Supporting classes 8 | ++++++++++++++++++ 9 | 10 | .. 
autoclass:: revscoring.features.bytes.Revision 11 | :members: 12 | 13 | """ 14 | from .revision_oriented import Revision, revision 15 | 16 | __all__ = [revision, Revision] 17 | -------------------------------------------------------------------------------- /examples/scoring.py: -------------------------------------------------------------------------------- 1 | import mwapi 2 | 3 | from revscoring import Model 4 | from revscoring.extractors import api 5 | 6 | with open("models/enwiki.damaging.linear_svc.model") as f: 7 | model = Model.load(f) 8 | 9 | extractor = api.Extractor(mwapi.Session(host="https://en.wikipedia.org", 10 | user_agent="revscoring demo")) 11 | values = extractor.extract(123456789, model.features) 12 | print(model.score(values)) 13 | -------------------------------------------------------------------------------- /revscoring/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a collection of utilities for extracting 3 | :class:`~revscoring.Feature` and 4 | :class:`~revscoring.Datasource` for a revision. 5 | 6 | api 7 | +++ 8 | .. automodule:: revscoring.extractors.api 9 | 10 | extractor 11 | +++++++++ 12 | .. 
automodule:: revscoring.extractors.extractor 13 | 14 | """ 15 | from .extractor import Extractor, OfflineExtractor 16 | 17 | __all__ = [Extractor, OfflineExtractor] 18 | -------------------------------------------------------------------------------- /tests/utilities/data/labeled_revisions.json: -------------------------------------------------------------------------------- 1 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124390"} 2 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124357"} 3 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16123622"} 4 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124436"} 5 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124458"} 6 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124488"} 7 | -------------------------------------------------------------------------------- /examples/language_support.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources.revision_oriented import revision 2 | from revscoring.dependencies import solve 3 | from revscoring.languages import english, spanish 4 | 5 | features = [english.informals.revision.matches, 6 | spanish.informals.revision.matches] 7 | values = solve(features, cache={revision.text: "I think it is stupid."}) 8 | 9 | for feature, value in zip(features, values): 10 | print("\t{0}: {1}".format(feature, repr(value))) 11 | -------------------------------------------------------------------------------- /tests/datasources/test_datasource.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.dependencies import solve 5 | 6 | 7 | def test_datasource(): 8 | 9 | d = Datasource("d") 10 | 11 | assert pickle.loads(pickle.dumps(d)) == d 12 | 13 | assert solve(d, cache={d: "foo"}) == "foo" 14 | 15 | assert solve(d, 
cache={"datasource.d": "foo"}) == "foo" 16 | 17 | assert str(d) == "datasource.d" 18 | assert repr(d) == "" 19 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_util.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from revscoring.scoring.models.util import normalize_json 4 | 5 | 6 | def test_normalize_json(): 7 | doc = {"foo": {numpy.bool_(True): "value"}, 8 | "what": numpy.bool_(False), 9 | "this": numpy.PINF} 10 | normalized_doc = normalize_json(doc) 11 | assert isinstance(normalized_doc['what'], bool) 12 | assert isinstance(list(normalized_doc['foo'].keys())[0], bool) 13 | assert normalized_doc['this'] == "Infinity" 14 | -------------------------------------------------------------------------------- /revscoring/features/meta/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Meta-Features are generalized :class:`revscoring.Datasource` --> 3 | :class:`revscoring.Feature` conversions implemented for convenience. 4 | 5 | aggregators 6 | +++++++++++ 7 | .. automodule:: revscoring.features.meta.aggregators 8 | :members: 9 | 10 | bools 11 | +++++ 12 | .. automodule:: revscoring.features.meta.bools 13 | :members: 14 | 15 | vectorizers 16 | +++++++++++ 17 | .. 
automodule:: revscoring.features.meta.vectorizers 18 | :members: 19 | 20 | """ 21 | -------------------------------------------------------------------------------- /tests/extractors/api/tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | from revscoring.extractors.api.extractor import Extractor 2 | 3 | 4 | def test_from_config(): 5 | config = { 6 | 'extractors': { 7 | 'enwiki': { 8 | 'host': "https://en.wikipedia.org", 9 | 'api_path': "/w/api.php", 10 | 'timeout': 20, 11 | 'user_agent': "revscoring tests" 12 | } 13 | } 14 | } 15 | 16 | Extractor.from_config(config, 'enwiki') # Doesn't error 17 | -------------------------------------------------------------------------------- /revscoring/dependencies/util.py: -------------------------------------------------------------------------------- 1 | 2 | class or_none: 3 | """ 4 | Constructs a callable that will return None if the input is None, but will 5 | otherwise run a function on incoming data. 6 | 7 | :Parameters: 8 | func : `function` 9 | A function to run on non-None inputs 10 | """ 11 | def __init__(self, func): 12 | self.func = func 13 | 14 | def __call__(self, val): 15 | if val is None: 16 | return None 17 | else: 18 | return self.func(val) 19 | -------------------------------------------------------------------------------- /revscoring/languages/basque.py: -------------------------------------------------------------------------------- 1 | from .features import Dictionary 2 | 3 | name = "basque" 4 | 5 | try: 6 | import enchant 7 | dictionary = enchant.Dict("eu") 8 | except enchant.errors.DictNotFoundError: 9 | raise ImportError("No enchant-compatible dictionary found for 'eu'. " + 10 | "Consider installing 'hunspell-eu'.") 11 | 12 | dictionary = Dictionary(name + ".dictionary", dictionary.check) 13 | """ 14 | :class:`~revscoring.languages.features.Dictionary` features via 15 | :class:`enchant.Dict` "eu". 
Provided by `hunspell-eu` 16 | """ 17 | -------------------------------------------------------------------------------- /tests/test_score_processor.py: -------------------------------------------------------------------------------- 1 | from revscoring import Model 2 | from revscoring.extractors import OfflineExtractor 3 | from revscoring.features import Constant 4 | from revscoring.score_processor import ScoreProcessor 5 | 6 | 7 | class FakeModel(Model): 8 | 9 | def score(featue_values): 10 | return not featue_values[0] 11 | 12 | 13 | def test_score_processor(): 14 | 15 | model = FakeModel([Constant(False)]) 16 | 17 | sp = ScoreProcessor(model, OfflineExtractor()) 18 | scores = sp.score([1, 2, 3]) 19 | 20 | for score in scores: 21 | assert score 22 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented 2 | 3 | from . import datasources, features 4 | 5 | name = "wikibase.revision" 6 | 7 | revision = features.Revision( 8 | name, 9 | datasources.Revision(name, revision_oriented.revision) 10 | ) 11 | """ 12 | Represents the base revision of interest. Implements this basic structure: 13 | 14 | * revision: :class:`~revscoring.features.wikibase.Revision` 15 | * parent: :class:`~revscoring.features.wikibase.Revision` 16 | * diff: :class:`~revscoring.features.wikibase.Diff` 17 | """ 18 | -------------------------------------------------------------------------------- /revscoring/scoring/models/gradient_boosting.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Gradient Boosting type classifier models. 3 | 4 | .. 
autoclass:: revscoring.scoring.models.GradientBoosting 5 | :members: 6 | :member-order: 7 | """ 8 | import logging 9 | 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | 12 | from .sklearn import ProbabilityClassifier 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class GradientBoosting(ProbabilityClassifier): 18 | """ 19 | Implements a Gradient Boosting model. 20 | """ 21 | Estimator = GradientBoostingClassifier 22 | -------------------------------------------------------------------------------- /tests/utilities/test_fetch_idioms.py: -------------------------------------------------------------------------------- 1 | from revscoring.utilities.fetch_idioms import is_idiom 2 | 3 | 4 | def test_is_idiom(): 5 | phrases = [ 6 | 'Appendix:English 19th Century idioms', 7 | 'about to', 8 | 'activist justice', 9 | 'attaboy', 10 | 'bat for the other team', 11 | 'beard the lion in his den', 12 | 'as gentle as a dove', 13 | 'I\'ll say' 14 | ] 15 | idioms = [phrase for phrase in phrases if is_idiom(phrase)] 16 | 17 | assert idioms == ['bat for the other team', 'beard the lion in his den'] 18 | -------------------------------------------------------------------------------- /revscoring/scoring/models/random_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Random Forest type classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.RandomForest 5 | :members: 6 | :member-order: 7 | """ 8 | import logging 9 | 10 | from sklearn.ensemble import RandomForestClassifier 11 | 12 | from .sklearn import ProbabilityClassifier 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class RandomForest(ProbabilityClassifier): 18 | """ 19 | Implements a Random Forest model. 
20 | """ 21 | Estimator = RandomForestClassifier 22 | SUPPORTS_CLASSWEIGHT = True 23 | -------------------------------------------------------------------------------- /examples/extraction.py: -------------------------------------------------------------------------------- 1 | from mwapi import Session 2 | 3 | from revscoring.extractors import api 4 | from revscoring.features import temporal, wikitext 5 | 6 | session = Session("https://en.wikipedia.org/w/api.php", user_agent="test") 7 | api_extractor = api.Extractor(session) 8 | 9 | features = [temporal.revision.day_of_week, 10 | temporal.revision.hour_of_day, 11 | wikitext.revision.parent.headings_by_level(2)] 12 | 13 | values = api_extractor.extract(624577024, features) 14 | for feature, value in zip(features, values): 15 | print("\t{0}: {1}".format(feature, repr(value))) 16 | -------------------------------------------------------------------------------- /revscoring/features/bytes/datasources.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import Datasource 2 | 3 | 4 | class Revision: 5 | 6 | def __init__(self, prefix, revision_datasources): 7 | 8 | self.bytes = Datasource( 9 | prefix + ".bytes", _process_bytes, 10 | depends_on=[revision_datasources.text] 11 | ) 12 | 13 | if hasattr(revision_datasources, "parent"): 14 | self.parent = Revision( 15 | prefix + ".parent", 16 | revision_datasources.parent 17 | ) 18 | 19 | 20 | def _process_bytes(text): 21 | return bytes(text, 'utf-8', 'replace') 22 | -------------------------------------------------------------------------------- /revscoring/languages/features/stemmed/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of stemmer applied to words. 3 | 4 | .. autoclass:: revscoring.languages.features.Stemmed 5 | :members: 6 | :member-order: bysource 7 | 8 | Supporting classes 9 | ------------------ 10 | 11 | .. 
autoclass:: revscoring.languages.features.stemmed.Revision 12 | :members: 13 | :member-order: bysource 14 | 15 | .. autoclass:: revscoring.languages.features.stemmed.Diff 16 | :members: 17 | :member-order: bysource 18 | """ 19 | from .features import Diff, Revision 20 | from .stemmed import Stemmed 21 | 22 | __all__ = [Stemmed, Revision, Diff] 23 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_gramming.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.datasources.meta import gramming 5 | from revscoring.dependencies import solve 6 | 7 | my_tokens = Datasource("my_tokens") 8 | my_grams = gramming.gram(my_tokens, grams=[(0,), (0, 2)]) 9 | 10 | 11 | def test_gramming(): 12 | assert (solve(my_grams, cache={my_tokens: ["one", "two", "three", "four"]}) == 13 | [("one",), ("one", "three"), ("two",), ("two", "four"), ("three",), 14 | ("four",)]) 15 | 16 | assert (pickle.loads(pickle.dumps(my_grams)) == 17 | my_grams) 18 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | commit-message: 13 | # Prefix all commit messages with "pip" 14 | prefix: "pip" 15 | -------------------------------------------------------------------------------- /revscoring/languages/features/stopwords/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of filtering words for stopwords 3 | 4 | .. autoclass:: revscoring.languages.features.Stopwords 5 | :members: 6 | :member-order: bysource 7 | 8 | Supporting classes 9 | ------------------ 10 | 11 | .. autoclass:: revscoring.languages.features.stopwords.Revision 12 | :members: 13 | :member-order: bysource 14 | 15 | .. 
autoclass:: revscoring.languages.features.stopwords.Diff 16 | :members: 17 | :member-order: bysource 18 | """ 19 | from .features import Diff, Revision 20 | from .stopwords import Stopwords 21 | 22 | __all__ = [Stopwords, Revision, Diff] 23 | -------------------------------------------------------------------------------- /tests/features/wikibase/tests/test_util.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.features.wikibase.util import diff_dicts 3 | 4 | 5 | def test_diff_dicts(): 6 | 7 | diff = diff_dicts(None, {'a': 1, 'b': 2}) 8 | assert diff.added == {'a', 'b'} 9 | assert diff.removed == set() 10 | assert diff.intersection == set() 11 | assert diff.changed == set() 12 | assert diff.unchanged == set() 13 | 14 | diff = diff_dicts({'a': 1, 'b': 2, 'c': 3}, {'a': 1, 'b': 3, 'd': 10}) 15 | assert diff.added == {'d'} 16 | assert diff.removed == {'c'} 17 | assert diff.intersection == {'a', 'b'} 18 | assert diff.changed == {'b'} 19 | assert diff.unchanged == {'a'} 20 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This features module provides access to features of the bytes of content in 3 | revisions. 4 | 5 | .. autodata:: revscoring.features.wikibase.revision 6 | 7 | Supporting classes 8 | ++++++++++++++++++ 9 | 10 | .. autoclass:: revscoring.features.wikibase.Revision 11 | :members: 12 | :member-order: bysource 13 | 14 | .. 
autoclass:: revscoring.features.wikibase.Diff 15 | :members: 16 | :member-order: bysource 17 | 18 | """ 19 | from .features import Diff, Revision 20 | from .revision_oriented import revision 21 | from .util import DictDiff, diff_dicts 22 | 23 | __all__ = [diff_dicts, DictDiff, revision, Revision, Diff] 24 | -------------------------------------------------------------------------------- /revscoring/dependencies/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides a general set of utilities for implementing a set of 3 | dependencies, solving them and injecting context and cache. 4 | 5 | .. automodule:: revscoring.dependencies.dependent 6 | 7 | functions 8 | +++++++++ 9 | .. automodule:: revscoring.dependencies.functions 10 | 11 | context 12 | +++++++ 13 | .. automodule:: revscoring.dependencies.context 14 | """ 15 | 16 | from .context import Context 17 | from .dependent import Dependent, DependentSet 18 | from .functions import dig, draw, expand, normalize_context, solve 19 | 20 | __all__ = [solve, expand, dig, draw, normalize_context, Context, Dependent, 21 | DependentSet] 22 | -------------------------------------------------------------------------------- /tests/utilities/test_union_intersect_observations.py: -------------------------------------------------------------------------------- 1 | from revscoring.utilities.intersect_merge_observations import \ 2 | intersect_merge_observations 3 | 4 | 5 | def test_intersect_merge(): 6 | """Merge and inspect results. 
7 | """ 8 | a = [ 9 | {"rev_id": 101, "goodfaith": False, "damaging": True}, 10 | {"rev_id": 102, "goodfaith": False, "damaging": False}, 11 | ] 12 | b = [ 13 | {"rev_id": 101, "goodfaith": True, "damaging": True} 14 | ] 15 | expected = [ 16 | {"rev_id": 101, "goodfaith": True, "damaging": True} 17 | ] 18 | result = intersect_merge_observations([a, b], "rev_id") 19 | assert expected == list(result) 20 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Statistics represent the fitness of a :class:`revscoring.Model`. They can 3 | be :func:`~revscoring.scoring.Statistics.fit` to scores and labels and 4 | then output using :func:`~revscoring.scoring.Statistics.format`. Once 5 | initialize, a :class:`~revscoring.scoring.Statistics` instance behaves like 6 | a `dict` of statistics values. 7 | 8 | Classification 9 | ++++++++++++++ 10 | .. automodule:: revscoring.scoring.statistics.classification 11 | 12 | Abstract base class 13 | +++++++++++++++++++ 14 | .. 
automodule:: revscoring.scoring.statistics.statistics 15 | 16 | """ 17 | from .classification import Classification 18 | 19 | __all__ = [Classification] 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython >= 0.28.5 2 | deep_merge >= 0.0.1, < 0.0.999 3 | deltas >= 0.7.0, < 0.7.999 4 | docopt >= 0.6.2, < 0.6.999 5 | gensim >= 3.8.1 6 | hanziconv >= 0.3.2, < 0.3.999 7 | mmh3 >= 2.5.1, < 2.5.999 8 | more-itertools >= 7.2.0, < 7.2.999 9 | mwapi >= 0.5.0, < 0.6.999 10 | mwbase >= 0.1.4, < 0.1.999 11 | mwtypes >= 0.2.0, < 0.3.999 12 | mwparserfromhell >= 0.6.5, < 0.6.999 13 | mysqltsv >= 0.0.7, < 0.0.999 14 | nltk >= 3.6.6 15 | numpy >= 1.21.5, < 1.25.0 16 | pytz >= 2017.2 17 | requests >= 2.0.0, < 2.999.999 18 | pyenchant >= 3.2.2 19 | scipy >= 1.5.4, < 1.10.1 20 | scikit-learn >= 1.0.2 21 | tabulate >= 0.9.0, < 0.9.999 22 | tqdm >= 4.15.0, < 4.15.9999 23 | yamlconf-wmf==0.2.10 24 | flashtext==2.7 25 | -------------------------------------------------------------------------------- /tests/scoring/test_environment.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pytest import raises 4 | 5 | from revscoring.about import __version__ 6 | from revscoring.scoring.environment import Environment 7 | 8 | 9 | def test_environment(): 10 | env = Environment() 11 | 12 | print(env.format(formatting="str")) 13 | print(json.dumps(env.format(formatting="json"), indent=2)) 14 | assert env.lookup(['revscoring_version']) == __version__ 15 | env.check(raise_exception=True) 16 | 17 | 18 | def test_env_error(): 19 | with raises(RuntimeError): 20 | env = Environment() 21 | env['revscoring_version'] = "foo" 22 | print(json.dumps(env.format(formatting="json"), indent=2)) 23 | env.check(raise_exception=True) 24 | 
-------------------------------------------------------------------------------- /docs/notes/2015-10-07.language_structure.txt: -------------------------------------------------------------------------------- 1 | - revscoring 2 | - features 3 | - revision 4 | - words 5 | - languages 6 | - english (Language, RegexBadwords, RegexInformals, Dictionary) 7 | - revision 8 | - badwords 9 | - proportion_of_badwords = english.revision.badwords / max(revision.words, 1) 10 | - informals 11 | - proportion_of_informals = english.revision.informals / max(revision.words, 1) 12 | - dict_words 13 | - proportion_of_dictwords 14 | - mispellings = revision.words - english.revision.dict_words 15 | - proportion_of_misspellings = english.revision.misspellings / max(revision.words, 1) 16 | - parent_revision 17 | - ... 18 | - french 19 | - ... 20 | 21 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_counts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | from revscoring.scoring.statistics.classification.counts import Counts 5 | 6 | 7 | def test_counts(): 8 | c = Counts( 9 | [True, False], 10 | [({'prediction': True}, True)] * 10 + 11 | [({'prediction': True}, False)] * 20 + 12 | [({'prediction': False}, False)] * 30 + 13 | [({'prediction': False}, True)] * 40, 14 | 'prediction' 15 | ) 16 | 17 | print(c.format_str({})) 18 | print(json.dumps(c.format_json({}), indent=2)) 19 | assert c.lookup("n") == 100 20 | assert c.lookup("labels.true") == 50 21 | assert c.lookup("predictions.false.false") == 30 22 | 23 | pickle.loads(pickle.dumps(c)) 24 | -------------------------------------------------------------------------------- /revscoring/languages/features/dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of dictionary lookup. 
3 | 4 | .. autoclass:: revscoring.languages.features.Dictionary 5 | :members: 6 | :member-order: bysource 7 | 8 | Supporting classes 9 | ------------------ 10 | 11 | .. autoclass:: revscoring.languages.features.dictionary.Revision 12 | :members: 13 | :member-order: bysource 14 | 15 | .. autoclass:: revscoring.languages.features.dictionary.Diff 16 | :members: 17 | :member-order: bysource 18 | """ 19 | from .dictionary import Dictionary 20 | from .features import Diff, Revision 21 | from .util import utf16_cleanup, load_dict, MultiDictChecker 22 | 23 | __all__ = [Dictionary, utf16_cleanup, load_dict, MultiDictChecker, Revision, Diff] 24 | -------------------------------------------------------------------------------- /revscoring/scoring/models/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of linear classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.LogisticRegression 5 | :members: 6 | :member-order: 7 | """ 8 | import logging 9 | 10 | from sklearn.linear_model import LogisticRegression as sklearn_LR 11 | 12 | from .sklearn import ProbabilityClassifier 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class LogisticRegression(ProbabilityClassifier): 18 | """ 19 | Implements a Logistic Regression 20 | """ 21 | Estimator = sklearn_LR 22 | 23 | def __init__(self, *args, label_weights=None, **kwargs): 24 | if label_weights: 25 | logger.warn("LogisticRegression does not support label_weights.") 26 | super().__init__(*args, **kwargs) 27 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.datasources.meta import hashing 5 | from revscoring.dependencies import solve 6 | 7 | my_tokens = Datasource("my_tokens") 8 | 
my_hashes = hashing.hash(my_tokens, n=10) 9 | 10 | 11 | def test_hashing(): 12 | hashes = solve( 13 | my_hashes, cache={my_tokens: [("one", "two"), "two", "three", "four"]}) 14 | 15 | assert len(hashes) == 4 16 | assert max(hashes) <= 10, str(max(hashes)) 17 | 18 | hashes_again = solve( 19 | my_hashes, cache={my_tokens: [("one", "two"), "two", "three", "four"]}) 20 | 21 | assert hashes == hashes_again 22 | 23 | assert (pickle.loads(pickle.dumps(my_hashes)) == 24 | my_hashes) 25 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_threshold_optimization.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.scoring.statistics.classification.threshold_optimization import \ 3 | ThresholdOptimization 4 | 5 | 6 | def test_threshold_optimization(): 7 | ThresholdOptimization.parse("maximum precision @ recall >= 0.9") 8 | to = ThresholdOptimization.parse("maximum precision @ !recall >= 0.9") 9 | assert to.maximize is True 10 | assert to.target_stat == "precision" 11 | assert to.cond_stat == "!recall" 12 | assert to.greater is True 13 | assert to.cond_value == 0.9 14 | 15 | to = ThresholdOptimization.parse("minimum waffle_monster @ peet <= 0.001") 16 | assert to.maximize is False 17 | assert to.target_stat == "waffle_monster" 18 | assert to.cond_stat == "peet" 19 | assert to.greater is False 20 | assert to.cond_value == 0.001 21 | -------------------------------------------------------------------------------- /revscoring/languages/features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dictionary 3 | ++++++++++ 4 | .. automodule :: revscoring.languages.features.dictionary 5 | 6 | RegexMatches 7 | ++++++++++++ 8 | .. automodule :: revscoring.languages.features.matches.regex_matches 9 | 10 | Stopwords 11 | +++++++++ 12 | .. 
automodule :: revscoring.languages.features.stopwords 13 | 14 | Stemmed 15 | +++++++ 16 | .. automodule :: revscoring.languages.features.stemmed 17 | 18 | SubstringMatches 19 | ++++++++++++++++ 20 | .. automodule :: revscoring.languages.features.matches.substring_matches 21 | 22 | """ 23 | from .dictionary import Dictionary 24 | from .stemmed import Stemmed 25 | from .stopwords import Stopwords 26 | from .matches.regex_matches import RegexMatches 27 | from .matches.substring_matches import SubstringMatches 28 | 29 | __all__ = [Dictionary, RegexMatches, Stemmed, Stopwords, SubstringMatches] 30 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: "CI" 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review] 5 | 6 | jobs: 7 | ci-merge: 8 | runs-on: ubuntu-latest 9 | container: 10 | image: docker-registry.wikimedia.org/bullseye:20221218 11 | options: --user root 12 | 13 | steps: 14 | - name: Check out repository code 15 | uses: actions/checkout@v3 16 | 17 | - name: Setup Wikimedia Bullseye image 18 | run: | 19 | apt-get update -y 20 | apt-get install python3-pip python3-dev python3-setuptools build-essential python3-enchant g++ gfortran git \ 21 | liblapack-dev libopenblas-dev libenchant-2-2 wget -y 22 | 23 | - name: Run all 24 | run: | 25 | make pip-install 26 | make setup-image 27 | make run-tests 28 | 29 | outputs: 30 | head-status: ${{ job.status }} 31 | -------------------------------------------------------------------------------- /tests/scoring/test_model_info.py: -------------------------------------------------------------------------------- 1 | from pytest import raises 2 | 3 | from revscoring.errors import ModelInfoLookupError 4 | from revscoring.scoring.model_info import ModelInfo 5 | 6 | 7 | def test_model_info(): 8 | mi = ModelInfo(default_fields={'bar', 'foo'}) 9 | mi['foo'] = 1 
10 | mi['bar'] = 2 11 | mi['baz'] = 3 12 | mi[True] = 1 13 | 14 | assert 'bar' in mi.format([''], formatting="json") 15 | assert 'baz' not in mi.format([''], formatting="json") 16 | mi.format(['true'], formatting="json") 17 | 18 | assert list(mi.keys()) == ['foo', 'bar', 'baz', True] 19 | assert list(mi.format_json({}).keys()) == ['foo', 'bar'] 20 | 21 | 22 | def test_model_info_error(): 23 | with raises(ModelInfoLookupError) as e: 24 | mi = ModelInfo() 25 | mi['baz'] = 3 26 | mi[True] = 1 27 | 28 | mi.format(['false']) 29 | assert e.value.args[0] == 'false' 30 | -------------------------------------------------------------------------------- /tests/features/bytes/tests/test_revision_oriented.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources import revision_oriented 4 | from revscoring.dependencies import solve 5 | from revscoring.features.bytes.revision_oriented import revision 6 | 7 | length_change = revision.parent.length - revision.length 8 | 9 | 10 | def test_length(): 11 | cache = {revision_oriented.revision.parent.text: "I am ascii", 12 | revision_oriented.revision.text: "地を南北に縦走する"} 13 | 14 | assert solve(revision.length, cache=cache) == 27 15 | assert solve(revision.parent.length, cache=cache) == 10 16 | assert solve(length_change, cache=cache) == -17 17 | 18 | assert pickle.loads(pickle.dumps(revision.length)) == revision.length 19 | assert (pickle.loads(pickle.dumps(revision.parent.length)) == 20 | revision.parent.length) 21 | assert pickle.loads(pickle.dumps(length_change)) == length_change 22 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_model.py: -------------------------------------------------------------------------------- 1 | 2 | from pytest import mark 3 | 4 | from revscoring.features import Feature 5 | from revscoring.scoring.models.model import Classifier, Learned, Model 6 | 7 | 8 | def 
test_model(): 9 | m = Model([Feature("foo", returns=int)], version="0.0.1") 10 | 11 | assert m.info.lookup('version') == "0.0.1" 12 | 13 | 14 | def test_from_config(): 15 | config = { 16 | 'scorer_models': { 17 | 'test': { 18 | 'module': "pytest.mark" 19 | } 20 | } 21 | } 22 | model = Model.from_config(config, 'test') 23 | assert model == mark 24 | 25 | 26 | def test_learned_model(): 27 | model = Learned([Feature("foo", returns=int)]) 28 | assert model.trained is None 29 | 30 | 31 | def test_classifier(): 32 | model = Classifier([Feature("foo", returns=int)], [True, False]) 33 | assert 'statustics' not in model.info 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for building a Docker container. See https://www.docker.com/ 2 | # Install wikimedia runscoring, with dependencies 3 | # See: https://github.com/wikimedia/revscoring 4 | 5 | # Build via docker build --rm -t nealmcb/revscoring:0.3 . 
6 | 7 | FROM jupyter/notebook 8 | 9 | RUN DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y 10 | 11 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \ 12 | python3-dev \ 13 | python3-numpy \ 14 | python3-scipy \ 15 | g++ \ 16 | gfortran \ 17 | liblapack-dev \ 18 | libopenblas-dev \ 19 | myspell-pt \ 20 | myspell-fa \ 21 | myspell-en-au \ 22 | myspell-en-gb \ 23 | myspell-en-us \ 24 | myspell-en-za \ 25 | myspell-fr \ 26 | myspell-es \ 27 | myspell-he \ 28 | hunspell-vi \ 29 | aspell-id 30 | 31 | RUN pip3 install --user revscoring 32 | 33 | RUN python3 -m nltk.downloader stopwords 34 | -------------------------------------------------------------------------------- /revscoring/languages/features/stemmed/stemmed.py: -------------------------------------------------------------------------------- 1 | from ....dependencies import DependentSet 2 | from ....features import wikitext 3 | from . import datasources, features 4 | 5 | 6 | class Stemmed(DependentSet): 7 | """ 8 | :Parameters: 9 | name : `str` 10 | A name for the collection 11 | stem_word : `func` 12 | A function that, give a word, will return a stemmed version of that 13 | word 14 | """ 15 | 16 | def __init__(self, name, stem_word): 17 | super().__init__(name) 18 | self.revision = features.Revision( 19 | name + ".revision", 20 | datasources.Revision(name + ".revision", stem_word, 21 | wikitext.revision.datasources) 22 | ) 23 | """ 24 | :class:`~revscoring.languages.features.stemmed.Revision` : 25 | The base revision feature set. 
26 | """ 27 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_dicts.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.datasources.meta import dicts 5 | from revscoring.dependencies import solve 6 | 7 | my_dict = Datasource("my_dict") 8 | 9 | my_keys = dicts.keys(my_dict) 10 | my_values = dicts.values(my_dict) 11 | 12 | 13 | def test_dict_keys(): 14 | cache = {my_dict: {"foo": 1, "bar": 2}} 15 | assert set(solve(my_keys, cache=cache)) == {"foo", "bar"} 16 | cache = {my_dict: None} 17 | assert set(solve(my_keys, cache=cache)) == set() 18 | 19 | assert pickle.loads(pickle.dumps(my_keys)) == my_keys 20 | 21 | 22 | def test_dict_values(): 23 | cache = {my_dict: {"foo": 1, "bar": 2}} 24 | assert set(solve(my_values, cache=cache)) == {1, 2} 25 | cache = {my_dict: None} 26 | assert set(solve(my_values, cache=cache)) == set() 27 | 28 | assert pickle.loads(pickle.dumps(my_values)) == my_values 29 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that 3 | return `mwtypes.Timestamp` of the given string. 4 | 5 | .. autoclass:: revscoring.datasources.meta.timestamp.Timestamp 6 | """ 7 | import mwtypes 8 | 9 | from ..datasource import Datasource 10 | 11 | MW_REGISTRATION_EPOCH = '2006-01-01T00:00:00Z' 12 | 13 | 14 | class Timestamp(Datasource): 15 | """ 16 | Generates a mwtypes.Timestamp of the given string 17 | 18 | :Parameters: 19 | timestamp_str : `str` 20 | Timestamp string in ISO format. 21 | name : `str` 22 | A name for the datasource. 
23 | """ 24 | 25 | def __init__(self, timestamp_str, name=None): 26 | super().__init__(name, self.process, 27 | depends_on=[timestamp_str]) 28 | 29 | def process(self, timestamp_str): 30 | return mwtypes.Timestamp(timestamp_str or MW_REGISTRATION_EPOCH) 31 | -------------------------------------------------------------------------------- /revscoring/utilities/check_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``revscoring check_model -h`` 3 | :: 4 | 5 | Compares a models construction environment snapshot to the current 6 | environment. 7 | 8 | Usage: 9 | check_model -h | --help 10 | check_model [--raise-exception] 11 | 12 | Options: 13 | -h --help Prints this documentation 14 | Path to a model file 15 | --raise-exception Causes an error return state if there are 16 | inconsistencies between the current environment 17 | and the model's build environment. 18 | """ 19 | import docopt 20 | 21 | from ..scoring import Model, models 22 | 23 | 24 | def main(argv=None): 25 | args = docopt.docopt(__doc__, argv=argv) 26 | raise_exception = args['--raise-exception'] 27 | Model.load(models.open_file(args['']), 28 | error_on_env_check=raise_exception) 29 | -------------------------------------------------------------------------------- /revscoring/languages/features/dictionary/dictionary.py: -------------------------------------------------------------------------------- 1 | from ....dependencies import DependentSet 2 | from ....features import wikitext 3 | from . import datasources, features 4 | 5 | 6 | class Dictionary(DependentSet): 7 | """ 8 | :Parameters: 9 | name : `str` 10 | A name for the collection 11 | dictionary_check : `func` 12 | A function that, given a word, performs a dictionary check and 13 | returns True if the word exists. 
14 | """ 15 | 16 | def __init__(self, name, dictionary_check): 17 | super().__init__(name) 18 | self.revision = features.Revision( 19 | name + ".revision", 20 | datasources.Revision(name + ".revision", dictionary_check, 21 | wikitext.revision.datasources) 22 | ) 23 | """ 24 | :class:`~revscoring.languages.features.dictionary.Revision` : 25 | The base revision feature set. 26 | """ 27 | -------------------------------------------------------------------------------- /revscoring/datasources/datasource.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autoclass:: revscoring.Datasource 3 | :member-order: 4 | :inherited-members: 5 | """ 6 | from ..dependencies import Dependent 7 | 8 | 9 | class Datasource(Dependent): 10 | """ 11 | Represents a data source for generating features. Unlike features, 12 | datasources do not necessarily generate simple scalar values. 13 | 14 | :Parameters: 15 | name : str 16 | The name of the feature 17 | process : `func` 18 | A function that will generate a data value 19 | depends_on : `list`(`hashable`) 20 | An ordered list of dependencies that correspond 21 | to the `*args` of `process` 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | 27 | def __hash__(self): 28 | return hash('datasource.' + self.name) 29 | 30 | def __str__(self): 31 | return "datasource." 
+ self.name 32 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_multilabel_counts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | from revscoring.scoring.statistics.classification.counts import \ 5 | MultilabelCounts 6 | 7 | 8 | def test_counts(): 9 | c = MultilabelCounts( 10 | ["foo", "bar", "baz"], 11 | [({'prediction': ["foo"]}, ["foo", "bar"])] * 10 + 12 | [({'prediction': ["foo", "bar", "baz"]}, ["foo", "baz"])] * 20 + 13 | [({'prediction': ["bar"]}, ["bar"])] * 30 + 14 | [({'prediction': ["baz"]}, ["bar", "baz"])] * 40, 15 | 'prediction' 16 | ) 17 | 18 | print(c.format_str({})) 19 | print(json.dumps(c.format_json({}), indent=2)) 20 | assert c.lookup("n") == 100 21 | assert c.lookup("labels.foo") == 30 22 | assert c.lookup("predictions.foo.true.false") == 0 23 | assert c.lookup("predictions.foo.true.true") == 30 24 | assert c.lookup("predictions.bar.false.true") == 20 25 | 26 | pickle.loads(pickle.dumps(c)) 27 | -------------------------------------------------------------------------------- /revscoring/languages/features/matches/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of a set of regexes applied to strings. 3 | 4 | .. autoclass:: revscoring.languages.features.RegexMatches 5 | :members: 6 | :member-order: bysource 7 | 8 | Implements a feature set based off of a set of substrings applied to strings. 9 | 10 | .. autoclass:: revscoring.languages.features.SubstringMatches 11 | :members: 12 | :member-order: bysource 13 | 14 | Supporting classes 15 | ------------------ 16 | 17 | .. autoclass:: revscoring.languages.features.matches.Revision 18 | :members: 19 | :member-order: bysource 20 | 21 | .. 
autoclass:: revscoring.languages.features.matches.Diff 22 | :members: 23 | :member-order: bysource 24 | """ 25 | from .features import Diff, Revision 26 | from .matches import Matches 27 | from .substring_matches import SubstringMatches 28 | from .regex_matches import RegexMatches 29 | 30 | __all__ = [Matches, RegexMatches, SubstringMatches, Revision, Diff] 31 | -------------------------------------------------------------------------------- /tests/features/test_modifiers.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from math import log as math_log 3 | 4 | from revscoring.dependencies import solve 5 | from revscoring.features import modifiers 6 | 7 | 8 | def test_log(): 9 | log_five = modifiers.log(5) 10 | 11 | assert solve(log_five) == math_log(5) 12 | 13 | assert solve(pickle.loads(pickle.dumps(log_five))) == math_log(5) 14 | 15 | assert repr(log_five) == "" 16 | 17 | 18 | def test_max(): 19 | 20 | max_five_six_seven = modifiers.max(5, 6, 7) 21 | 22 | assert solve(max_five_six_seven) == 7 23 | 24 | assert solve(pickle.loads(pickle.dumps(max_five_six_seven))) == 7 25 | 26 | assert repr(max_five_six_seven) == "" 27 | 28 | 29 | def test_min(): 30 | 31 | min_five_six_seven = modifiers.min(5, 6, 7) 32 | 33 | assert solve(min_five_six_seven) == 5 34 | 35 | assert pickle.loads(pickle.dumps(min_five_six_seven)) == min_five_six_seven 36 | 37 | assert repr(min_five_six_seven) == "" 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *~ 5 | ipython/.ipynb_checkpoints 6 | .python-version 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Demo files 12 | demo_*.py 13 | 14 | # Datasets & Models 15 | datasets/ 16 | models/*.model 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | bin/ 22 | build/ 23 | 
docs/_build/ 24 | develop-eggs/ 25 | dist/ 26 | eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | _build/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Mr Developer 53 | .mr.developer.cfg 54 | .project 55 | .pydevproject 56 | 57 | # Rope 58 | .ropeproject 59 | 60 | # Django stuff: 61 | *.log 62 | *.pot 63 | 64 | # Sphinx documentation 65 | doc/_build/ 66 | 67 | # pyCharm, IntelliJ 68 | *.idea 69 | *.iml 70 | -------------------------------------------------------------------------------- /docs/api_reference.rst: -------------------------------------------------------------------------------- 1 | .. _api-reference: 2 | 3 | :orphan: 4 | 5 | revscoring package 6 | ================== 7 | 8 | Subpackages 9 | ----------- 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | revscoring.dependencies 15 | revscoring.datasources 16 | revscoring.datasources.meta 17 | revscoring.datasources.revision_oriented 18 | revscoring.extractors 19 | revscoring.features 20 | revscoring.features.meta 21 | revscoring.features.modifiers 22 | revscoring.features.revision_oriented 23 | revscoring.features.bytes 24 | revscoring.features.temporal 25 | revscoring.features.wikibase 26 | revscoring.features.wikitext 27 | revscoring.languages 28 | revscoring.languages.features 29 | revscoring.scoring 30 | revscoring.scoring.models 31 | revscoring.scoring.statistics 32 | revscoring.utilities 33 | 34 | Submodules 35 | ---------- 36 | 37 | revscoring.errors module 38 | ------------------------ 39 | 40 | .. 
automodule:: revscoring.errors 41 | :members: 42 | :undoc-members: 43 | -------------------------------------------------------------------------------- /tests/features/test_functions.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.datasources import Datasource 3 | from revscoring.features.feature import Constant, Feature 4 | from revscoring.features.feature_vector import FeatureVector 5 | from revscoring.features.functions import trim, vectorize_values 6 | from revscoring.features.modifiers import log, max 7 | 8 | 9 | def test_trim(): 10 | 11 | d1 = Datasource("derp1") 12 | f1 = Feature("foobar1", returns=int) 13 | f2 = Feature("foobar2", returns=int, depends_on=[d1]) 14 | c = Constant(value=5) 15 | fv = FeatureVector("foobar3", returns=int, depends_on=[c]) 16 | 17 | assert list(trim(f1)) == [f1] 18 | assert list(trim([f1, f2, fv])) == [f1, f2, fv] 19 | assert list(trim([f1, f2, f1 + f2, fv])) == [f1, f2, fv] 20 | assert (list(trim(log(max(f1 - f2, 1)))) == 21 | [f1, f2]) 22 | 23 | 24 | def test_vectorize_features(): 25 | 26 | feature_values = [1, 2.0, [1.0, 2.0, 3.0], False] 27 | assert (vectorize_values(feature_values) == 28 | [1, 2.0, 1.0, 2.0, 3.0, False]) 29 | -------------------------------------------------------------------------------- /revscoring/languages/features/matches/substring_matches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of list of regular expressions to match. 3 | 4 | .. autoclass:: revscoring.languages.features.SubstringMatches 5 | :members: 6 | :member-order: bysource 7 | """ 8 | from . import Matches 9 | from ....datasources.meta import extractors 10 | 11 | 12 | class SubstringMatches(Matches): 13 | """ 14 | :Parameters: 15 | name : `str` 16 | A name for the collection 17 | substrings : `list` ( `str` ) 18 | A list of substrings to match. 
19 | exclusions : `list` ( `str` ) 20 | A list of substrings to explicitly not match 21 | """ 22 | 23 | def __init__(self, name, substrings, exclusions=None, 24 | text_preprocess=None): 25 | matcher = extractors.trie(substrings, 26 | exclusions=exclusions).process 27 | super().__init__(name, matcher, substrings, exclusions, 28 | text_preprocess=text_preprocess) 29 | -------------------------------------------------------------------------------- /tests/scoring/test_util.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.scoring import util 3 | 4 | 5 | def test_pattern(): 6 | assert (util.parse_pattern("'maximum filter_rate @ recall >= 0.9'.labels.true") == # noqa 7 | ["maximum filter_rate @ recall >= 0.9", "labels", "true"]) 8 | assert (util.parse_pattern("'maximum filter_rate @ recall >= 0.9'.'labels'.true") == # noqa 9 | ["maximum filter_rate @ recall >= 0.9", "labels", "true"]) 10 | assert (util.parse_pattern("'foo\"bar\"'.buz") == # noqa 11 | ["foo\"bar\"", "buz"]) 12 | 13 | 14 | def test_treeify(): 15 | paths = (util.parse_pattern(p) 16 | for p in ['foo.bar.baz', 'foo.bar.buz', 'foo.bar', 'bum']) 17 | assert (util.treeify(paths) == 18 | {'foo': {'bar': {'baz': {}, 'buz': {}}}, 'bum': {}}) 19 | assert util.treeify([util.parse_pattern("")]) == {} 20 | 21 | 22 | def test_dict_lookup(): 23 | r = util.dict_lookup({'foo': {'bar': {'baz': 1}}, 'bum': {'derp': 2}}, 24 | {'foo': {'bar': {}}}) 25 | assert r == {'foo': {'bar': {'baz': 1}}} 26 | -------------------------------------------------------------------------------- /revscoring/scoring/models/svc.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Support Vector Machine type classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.LinearSVC 5 | :members: 6 | :member-order: 7 | 8 | .. autoclass:: revscoring.scoring.models.RBFSVC 9 | :members: 10 | :member-order: 11 | 12 | .. 
autoclass:: revscoring.scoring.models.SVC 13 | :members: 14 | :member-order: 15 | 16 | """ 17 | from sklearn import svm 18 | 19 | from .sklearn import ProbabilityClassifier 20 | 21 | 22 | class SVC(ProbabilityClassifier): 23 | """ 24 | Implements a Support Vector Classifier model. 25 | """ 26 | Estimator = svm.SVC 27 | BASE_PARAMS = {'probability': True} 28 | 29 | 30 | class LinearSVC(SVC): 31 | """ 32 | Implements a Support Vector Classifier model with a Linear kernel. 33 | """ 34 | BASE_PARAMS = {'probability': True, 'kernel': "linear"} 35 | 36 | 37 | class RBFSVC(SVC): 38 | """ 39 | Implements a Support Vector Classifier model with an RBF kernel. 40 | """ 41 | BASE_PARAMS = {'probability': True, 'kernel': "rbf"} 42 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Meta-Features are classes that extend :class:`~revscoring.Datasource` and 3 | implement common operations on other :class:`~revscoring.Datasource`. 4 | 5 | dicts 6 | +++++ 7 | .. automodule:: revscoring.datasources.meta.dicts 8 | 9 | extractors 10 | ++++++++++ 11 | .. automodule:: revscoring.datasources.meta.extractors 12 | 13 | filters 14 | +++++++ 15 | .. automodule:: revscoring.datasources.meta.filters 16 | 17 | frequencies 18 | +++++++++++ 19 | .. automodule:: revscoring.datasources.meta.frequencies 20 | 21 | gramming 22 | ++++++++ 23 | .. automodule:: revscoring.datasources.meta.gramming 24 | 25 | hashing 26 | +++++++ 27 | .. automodule:: revscoring.datasources.meta.hashing 28 | 29 | indexable 30 | +++++++++ 31 | .. automodule:: revscoring.datasources.meta.indexable 32 | 33 | mappers 34 | +++++++ 35 | .. automodule:: revscoring.datasources.meta.mappers 36 | 37 | selectors 38 | +++++++++ 39 | .. automodule:: revscoring.datasources.meta.selectors 40 | 41 | timestamp 42 | +++++++++ 43 | .. 
automodule:: revscoring.datasources.meta.timestamp 44 | """ 45 | -------------------------------------------------------------------------------- /revscoring/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scoring is what the `revscoring` library was designed to do. The basics of 3 | scoring are :class:`revscoring.Model` that implement 4 | :func:`~revscoring.Model.score` and :class:`revscoring.scoring.Statistics` that 5 | are :func:`~revscoring.scoring.Statistics.fit` using the scores generated by a 6 | :class:`revscoring.Model`. Prediction models are fragile, so models keep track 7 | of their :class:`revscoring.scoring.Environment` and you can 8 | :func:`revscoring.scoring.Environment.check` them against the current 9 | environment. 10 | 11 | See :mod:`revscoring.scoring.models` and :mod:`revscoring.scoring.statistics` 12 | for more information. 13 | 14 | .. autoclass:: revscoring.Model 15 | :members: 16 | 17 | .. autoclass:: revscoring.scoring.ModelInfo 18 | :members: 19 | 20 | .. autoclass:: revscoring.scoring.Environment 21 | :members: 22 | """ 23 | from .environment import Environment 24 | from .model_info import ModelInfo 25 | from .models.model import Model 26 | from .statistics.statistics import Statistics 27 | 28 | __all__ = [Model, ModelInfo, Statistics, Environment] 29 | -------------------------------------------------------------------------------- /revscoring/scoring/models/naive_bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Naive Bayes type classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.GaussianNB 5 | :members: 6 | :member-order: 7 | 8 | .. autoclass:: revscoring.scoring.models.MultinomialNB 9 | :members: 10 | :member-order: 11 | 12 | .. 
autoclass:: revscoring.scoring.models.BernoulliNB 13 | :members: 14 | :member-order: 15 | """ 16 | import logging 17 | 18 | from sklearn import naive_bayes 19 | 20 | from .sklearn import ProbabilityClassifier 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class NaiveBayes(ProbabilityClassifier): 26 | pass 27 | 28 | 29 | class GaussianNB(NaiveBayes): 30 | """ 31 | Implements a Gaussian Naive Bayes model 32 | """ 33 | Estimator = naive_bayes.GaussianNB 34 | 35 | 36 | class MultinomialNB(NaiveBayes): 37 | """ 38 | Implements a Multinomial Naive Bayes model 39 | """ 40 | Estimator = naive_bayes.MultinomialNB 41 | 42 | 43 | class BernoulliNB(NaiveBayes): 44 | """ 45 | Implements a Bernoulli Naive Bayes model 46 | """ 47 | Estimator = naive_bayes.BernoulliNB 48 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | pip-install: requirements.txt test-requirements.txt 2 | pip install -r requirements.txt 3 | pip install -r test-requirements.txt 4 | 5 | .PHONY: run-tests 6 | run-tests: 7 | python3 -m pytest tests/ -v --cov 8 | 9 | .PHONY: setup-image 10 | setup-image: 11 | apt-get install \ 12 | hunspell-ar \ 13 | aspell-bn \ 14 | aspell-el \ 15 | hunspell-id \ 16 | hunspell-en-us \ 17 | aspell-is \ 18 | aspell-pl \ 19 | aspell-ro \ 20 | aspell-sv \ 21 | aspell-ta \ 22 | aspell-uk \ 23 | hunspell-cs \ 24 | hunspell-de-at \ 25 | hunspell-de-ch \ 26 | hunspell-de-de \ 27 | hunspell-es \ 28 | hunspell-et \ 29 | myspell-fa \ 30 | hunspell-fr \ 31 | hunspell-he \ 32 | hunspell-hr \ 33 | aspell-hu \ 34 | hunspell-lv \ 35 | myspell-nb \ 36 | hunspell-nl \ 37 | hunspell-pt-pt \ 38 | hunspell-pt-br \ 39 | hunspell-ru \ 40 | hunspell-hr \ 41 | hunspell-bs \ 42 | hunspell-ca \ 43 | hunspell-en-au \ 44 | hunspell-en-us \ 45 | hunspell-en-gb \ 46 | hunspell-eu \ 47 | hunspell-gl \ 48 | hunspell-it \ 49 | hunspell-hi \ 50 | hunspell-sr \ 51 | 
hunspell-vi \ 52 | -y 53 | python3 -m nltk.downloader omw sentiwordnet stopwords wordnet 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Aaron Halfaker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /revscoring/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a set of 3 | :class:`~revscoring.Datasource` 4 | processors that represent the input data for extracting 5 | :class:`~revscoring.Feature` values. 
Just like
:class:`~revscoring.Feature` and other
:class:`~revscoring.Dependent` processors,
:class:`~revscoring.Datasource` processors are intended to
be :func:`~revscoring.dependencies.solve`'d as dependencies.
27 | """ 28 | 29 | 30 | class WordIsInStopwordSet: 31 | 32 | def __init__(self, stopword_set, cleanup=None): 33 | self.stopword_set = stopword_set 34 | 35 | def __call__(self, word): 36 | return word.lower() in self.stopword_set 37 | -------------------------------------------------------------------------------- /revscoring/features/temporal/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This features module provides access to features of the bytes of content in 3 | revisions. 4 | 5 | .. autodata:: revscoring.features.temporal.revision 6 | 7 | Supporting classes 8 | ++++++++++++++++++ 9 | 10 | .. autoclass:: revscoring.features.temporal.Revision 11 | :members: 12 | :member-order: bysource 13 | 14 | .. autoclass:: revscoring.features.temporal.ParentRevision 15 | :members: 16 | :member-order: bysource 17 | 18 | .. autoclass:: revscoring.features.temporal.LastUserRevision 19 | :members: 20 | :member-order: bysource 21 | 22 | .. autoclass:: revscoring.features.temporal.PageCreation 23 | :members: 24 | :member-order: bysource 25 | 26 | .. autoclass:: revscoring.features.temporal.Page 27 | :members: 28 | :member-order: bysource 29 | 30 | .. autoclass:: revscoring.features.temporal.User 31 | :members: 32 | :member-order: bysource 33 | 34 | """ 35 | from .revision_oriented import (LastUserRevision, Page, PageCreation, 36 | ParentRevision, Revision, User, revision) 37 | 38 | __all__ = [revision, Revision, ParentRevision, LastUserRevision, PageCreation, 39 | Page, User] 40 | -------------------------------------------------------------------------------- /revscoring/languages/features/matches/regex_matches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of list of regular expressions to match. 3 | 4 | .. autoclass:: revscoring.languages.features.RegexMatches 5 | :members: 6 | :member-order: bysource 7 | """ 8 | from . 
import Matches 9 | from ....datasources.meta import extractors 10 | 11 | 12 | class RegexMatches(Matches): 13 | """ 14 | :Parameters: 15 | name : `str` 16 | A name for the collection 17 | regexes : `list` ( `str` ) 18 | A list of regex patterns to match. 19 | exclusions : `list` ( `str` ) 20 | A list of terms to explicitly not match 21 | wrapping : `tuple` ( `str`, `str` ) 22 | Insert these characters around matches in the regular expression 23 | """ 24 | 25 | def __init__(self, name, regexes, exclusions=None, 26 | wrapping=(r'\b', r'\b'), text_preprocess=None): 27 | matcher = extractors.regex(regexes, wrapping=wrapping, 28 | exclusions=exclusions).process 29 | super().__init__(name, matcher, regexes, exclusions, 30 | text_preprocess) 31 | -------------------------------------------------------------------------------- /tests/features/meta/tests/test_vectorizers.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources import Datasource 4 | from revscoring.dependencies import solve 5 | from revscoring.features.meta import vectorizers 6 | 7 | my_dict = Datasource("my_dict") 8 | 9 | 10 | class KeysDict(Datasource): 11 | 12 | def __init__(self, name, keys): 13 | super().__init__(name) 14 | self._keys = keys 15 | 16 | def keys(self): 17 | return self._keys 18 | 19 | 20 | my_keys_dict = KeysDict("my_keys_dict", ["a", "b", "c"]) 21 | 22 | 23 | def test_vectorize(): 24 | my_vector = vectorizers.vectorize( 25 | my_dict, ["a", "b", "c"], returns=int) 26 | 27 | assert (solve(my_vector, cache={my_dict: {"a": 5}}) == 28 | [5, 0, 0]) 29 | assert (solve(my_vector, cache={my_dict: {"d": 5}}) == 30 | [0, 0, 0]) 31 | assert (solve(my_vector, cache={my_dict: {"a": 1, "b": 2, "c": 3}}) == 32 | [1, 2, 3]) 33 | 34 | assert pickle.loads(pickle.dumps(my_vector)) == my_vector 35 | 36 | my_keys_vector = vectorizers.vectorize(my_keys_dict, returns=int) 37 | 38 | assert (solve(my_keys_vector, 
cache={my_keys_dict: {"a": 1, "b": 2, "c": 3}}) == 39 | [1, 2, 3]) 40 | -------------------------------------------------------------------------------- /revscoring/features/feature_vector.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autoclass:: revscoring.FeatureVector 3 | :members: 4 | """ 5 | from revscoring.features import Feature 6 | 7 | 8 | class FeatureVector(Feature): 9 | """ 10 | Represents a vector of predictive features. 11 | 12 | :Parameters: 13 | name : str 14 | The name of the feature 15 | process : `func` 16 | A function that will generate a feature value 17 | returns : `type` 18 | A type to compare the return vector of this function to. 19 | dependencies : `list`(`hashable`) 20 | An ordered list of dependencies that correspond 21 | to the `*args` of `process` 22 | """ 23 | 24 | def validate(self, vector): 25 | for i, value in enumerate(vector): 26 | if not isinstance(value, self.returns): 27 | raise ValueError( 28 | "Expected {0}, but got {1} instead at position {2}." 29 | .format(self.returns, type(value), i)) 30 | 31 | return vector 32 | 33 | def __hash__(self): 34 | return hash('feature_vector.' + self.name) 35 | 36 | def __str__(self): 37 | return "feature_vector." + self.name 38 | -------------------------------------------------------------------------------- /revscoring/features/modifiers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modifiers provide convenient mechanisms for modifying and combining 3 | :class:`revscoring.Feature` and constant values into new 4 | :class:`revscoring.Feature`. 5 | 6 | .. autofunction:: revscoring.features.modifiers.log 7 | 8 | ---- 9 | 10 | .. autofunction:: revscoring.features.modifiers.min 11 | .. autofunction:: revscoring.features.modifiers.max 12 | 13 | ---- 14 | 15 | .. autofunction:: revscoring.features.modifiers.add 16 | .. autofunction:: revscoring.features.modifiers.sub 17 | .. 
autofunction:: revscoring.features.modifiers.mul 18 | .. autofunction:: revscoring.features.modifiers.div 19 | 20 | ---- 21 | 22 | .. autofunction:: revscoring.features.modifiers.eq 23 | .. autofunction:: revscoring.features.modifiers.ne 24 | .. autofunction:: revscoring.features.modifiers.gt 25 | .. autofunction:: revscoring.features.modifiers.lt 26 | .. autofunction:: revscoring.features.modifiers.ge 27 | .. autofunction:: revscoring.features.modifiers.le 28 | 29 | """ 30 | from .feature import (add, and_, div, eq, ge, gt, le, log, lt, max, min, mul, 31 | ne, not_, or_, sub) 32 | 33 | __all__ = [add, div, eq, ge, gt, le, log, lt, max, min, mul, ne, sub, and_, 34 | or_, not_] 35 | -------------------------------------------------------------------------------- /tests/scoring/test_labels.py: -------------------------------------------------------------------------------- 1 | from revscoring.scoring.labels import Binarizer, ClassVerifier 2 | 3 | 4 | def test_class_verifier(): 5 | label_set = [True, False] 6 | cv = ClassVerifier(label_set) 7 | labels = [True, True, False, True, False, False] 8 | cv.check_label_consistency(labels) 9 | normalized_labels = cv.normalize(True) 10 | assert normalized_labels 11 | 12 | 13 | def test_binarizer(): 14 | label_set = ['A', 'B', 'C', 'D'] 15 | labels = [['A', 'B'], ['B', 'D'], ['A', 'B', 'C', 'D'], ['B', 'C']] 16 | binarizer = Binarizer(label_set) 17 | binarizer.check_label_consistency(labels) 18 | normalized_labels = binarizer.normalize(labels[1]) 19 | normalized_labels_actual = [0, 1, 0, 1] 20 | assert normalized_labels == normalized_labels_actual 21 | 22 | denormalized_labels = binarizer.denormalize(normalized_labels_actual) 23 | assert denormalized_labels == labels[1] 24 | 25 | 26 | def test_label_weights_normalizer(): 27 | label_weights = {'A': 0.4, 'B': 0.6} 28 | label_set = ['A', 'B'] 29 | binarizer = Binarizer(label_set) 30 | expected_label_weights = [{0: 1, 1: 0.4}, {0: 1, 1: 0.6}] 31 | assert expected_label_weights == \ 
32 | binarizer.normalize_weights(label_weights) 33 | -------------------------------------------------------------------------------- /tests/languages/util.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented as ro 2 | from revscoring.dependencies import solve 3 | 4 | 5 | def simple_eq(a, b): 6 | return a == b 7 | 8 | 9 | def compare_extraction(extractor, examples, counter_examples, 10 | lwrap="", rwrap="", eq=simple_eq): 11 | def process(text): 12 | return solve(extractor, cache={ro.revision.text: text}) 13 | 14 | for example in examples: 15 | wrapped = lwrap + example + rwrap 16 | assert eq(process(wrapped), [example]), \ 17 | " ".join([repr(wrapped), str(process(wrapped)), str([example])]) 18 | assert eq(process( 19 | "Sentence " + 20 | wrapped + 21 | " sandwich."), [example]) 22 | assert eq(process("Sentence end " + wrapped + "."), [example]) 23 | assert eq(process(wrapped + " start of sentence."), [example]) 24 | 25 | for example in counter_examples: 26 | wrapped = lwrap + example + rwrap 27 | assert process(wrapped) == [], process(wrapped) 28 | assert process("Sentence " + wrapped + " sandwich.") == [] 29 | assert process("Sentence end " + wrapped + ".") == [] 30 | assert process(wrapped + " start of sentence.") == [] 31 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/statistics.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autoclass:: revscoring.scoring.Statistics 3 | :members: 4 | :inherited-members: 5 | :member-order: 6 | """ 7 | import logging 8 | 9 | from ..model_info import ModelInfo 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class Statistics(ModelInfo): 15 | """ 16 | Construct a set of Statistics. Instances of this class work like a 17 | `dict` of statistical values once 18 | :func:`revscoring.scoring.Statistics.fit` is called. 
19 | """ 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | self.fitted = False 23 | 24 | def fit(self, score_labels): 25 | """ 26 | Fit to scores and labels. 27 | 28 | :Parameters: 29 | score_labels : [( `dict`, `mixed` )] 30 | A collection of scores-label pairs generated using 31 | :class:`revscoring.Model.score`. Note that fitting is usually 32 | done using data withheld during model training 33 | """ 34 | self.fitted = True 35 | 36 | def format_str(self, path_tree, **kwargs): 37 | raise NotImplementedError() 38 | 39 | def format_json(self, path_tree, **kwargs): 40 | raise NotImplementedError() 41 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/datasources/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.dependencies import DependentSet 2 | 3 | from . import edit, parsed, sentences, tokenized 4 | 5 | 6 | class BaseRevision(DependentSet): 7 | 8 | def __init__(self, name, revision_datasources): 9 | super().__init__(name) 10 | self.text = revision_datasources.text 11 | 12 | if hasattr(revision_datasources, "parent"): 13 | self.parent = Revision( 14 | name + ".parent", 15 | revision_datasources.parent 16 | ) 17 | 18 | 19 | class Revision(parsed.Revision, sentences.Revision, tokenized.Revision, 20 | BaseRevision): 21 | 22 | def __init__(self, name, revision_datasources): 23 | # Initializes all of the Revision datasources 24 | super().__init__(name, revision_datasources) 25 | 26 | # Initializes the diff using the Revision datasources 27 | if hasattr(revision_datasources, "diff"): 28 | self.diff = Diff(name + ".diff", self) 29 | 30 | 31 | class BaseDiff(DependentSet): 32 | 33 | def __init__(self, name, revision): 34 | super().__init__(name) 35 | self.revision = revision 36 | 37 | 38 | class Diff(edit.Diff, sentences.Diff, tokenized.Diff, BaseDiff): 39 | pass 40 | 
-------------------------------------------------------------------------------- /revscoring/datasources/meta/indexable.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that 3 | return `list`'s and `tuple`'s 4 | 5 | .. autoclass:: revscoring.datasources.meta.indexable.index 6 | 7 | """ 8 | from ..datasource import Datasource 9 | 10 | 11 | class index(Datasource): 12 | """ 13 | Generates a datasource that returns the value that appears at `i` 14 | 15 | :Parameters: 16 | i : `int` 17 | The index of a value to return 18 | default : `mixed` 19 | The value to return if no value exists at `i`. If not specified, 20 | an IndexError will be raised 21 | name : `str` 22 | A name for the new datasource. 23 | """ 24 | 25 | def __init__(self, i, datasources, default=NotImplemented, name=None): 26 | name = self._format_name(name, [i, default]) 27 | self.i = int(i) 28 | self.default = default 29 | super().__init__(name, self.process, 30 | depends_on=[datasources]) 31 | 32 | def process(self, indexable): 33 | try: 34 | return indexable[self.i] 35 | except IndexError: 36 | if self.default is not NotImplemented: 37 | return self.default 38 | else: 39 | raise 40 | -------------------------------------------------------------------------------- /revscoring/features/bytes/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented 2 | from revscoring.dependencies import DependentSet 3 | 4 | from ..meta import aggregators 5 | from . 
import datasources 6 | 7 | name = "bytes.revision" 8 | 9 | 10 | class Revision(DependentSet): 11 | 12 | def __init__(self, name, revision_datasources): 13 | super().__init__(name) 14 | self.length = aggregators.len( 15 | revision_datasources.bytes, 16 | name=name + ".length" 17 | ) 18 | "`int` : The length of the revision content in bytes" 19 | 20 | if hasattr(revision_datasources, "parent"): 21 | self.parent = Revision( 22 | name + ".parent", 23 | revision_datasources.parent 24 | ) 25 | """ 26 | :class:`revscoring.features.bytes.Revision` : The 27 | parent (aka "previous") revision of the page. 28 | """ 29 | 30 | 31 | revision = Revision(name, 32 | datasources.Revision(name, revision_oriented.revision)) 33 | """ 34 | Represents the base revision of interest. Implements this a basic structure: 35 | 36 | * revision: :class:`~revscoring.features.bytes.Revision` 37 | * parent: :class:`~revscoring.features.bytes.Revision` 38 | """ 39 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/classification/label_thresholds.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import OrderedDict 3 | 4 | from ... 
import util 5 | from ...model_info import ModelInfo 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class LabelThresholds(ModelInfo): 11 | 12 | def lookup(self, path): 13 | if len(path) > 0: 14 | key = path[0] 15 | if len(path[1:]) > 0: 16 | logger.warn("Ignoring path at {0!r}".format(path[1:])) 17 | return self[key] 18 | else: 19 | return self 20 | 21 | def format_str(self, path_tree, **kwargs): 22 | formatted = "thresholds:\n" 23 | for label in path_tree.keys() or self.keys(): 24 | sub_tree = path_tree.get(label, {}) 25 | formatted += util.tab_it_in(repr(label)) 26 | table_str = self[label].format_str(sub_tree, **kwargs) 27 | formatted += util.tab_it_in(table_str, 2) 28 | formatted += "\n" 29 | return formatted 30 | 31 | def format_json(self, path_tree, **kwargs): 32 | doc = OrderedDict 33 | for label in path_tree.keys() or self.keys(): 34 | sub_tree = path_tree.get(label, {}) 35 | doc[label] = self[label].format_json(sub_tree, **kwargs) 36 | return doc 37 | -------------------------------------------------------------------------------- /revscoring/utilities/model_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``revscoring model_info -h`` 3 | :: 4 | 5 | Prints formatted information about a model file. 6 | 7 | 8 | Usage: 9 | module_info -h | --help 10 | module_info [...] [--formatting=] 11 | 12 | Options: 13 | -h --help Prints this documentation 14 | Path to a model file 15 | A model information path. If no path is provided, 16 | all default fields will be in the output. 17 | --formatting= What format to output the information? 
"str" or 18 | "json" [default: str] 19 | """ 20 | import json 21 | 22 | import docopt 23 | 24 | from ..scoring import Model, models 25 | 26 | 27 | def main(argv=None): 28 | args = docopt.docopt(__doc__, argv=argv) 29 | scoring_model = Model.load(models.open_file(args[''])) 30 | paths = args[''] 31 | formatting = args['--formatting'] 32 | 33 | run(scoring_model, paths, formatting) 34 | 35 | 36 | def run(scoring_model, paths, formatting): 37 | formatted = scoring_model.info.format(paths, formatting=formatting) 38 | if formatting == "json": 39 | formatted = json.dumps(formatted, indent=2) 40 | 41 | print(formatted) 42 | -------------------------------------------------------------------------------- /tests/utilities/test_util.py: -------------------------------------------------------------------------------- 1 | from revscoring.utilities.util import (read_labels_and_population_rates, 2 | read_labels_config) 3 | 4 | 5 | def test_plain_labels(): 6 | labels, label_weights, population_rates = read_labels_and_population_rates( 7 | "true,false", ["true=5"], ["true=0.1", "false=0.9"], None) 8 | 9 | assert labels == [True, False] 10 | assert label_weights == {True: 5} 11 | assert population_rates == {True: 0.1, False: 0.9} 12 | 13 | 14 | def test_pop_rates_labels(): 15 | labels, label_weights, population_rates = read_labels_and_population_rates( 16 | None, ["true=5"], ["true=0.1", "false=0.9"], None) 17 | 18 | assert labels == [True, False] 19 | assert label_weights == {True: 5} 20 | assert population_rates == {True: 0.1, False: 0.9} 21 | 22 | 23 | def test_labels_config(): 24 | labels_config = { 25 | 'name': "enwiki damaging", 26 | 'labels': [ 27 | {'value': True, 'weight': 5, 'population_rate': 0.1}, 28 | {'value': False, 'population_rate': 0.9} 29 | ]} 30 | labels, label_weights, population_rates = read_labels_config(labels_config) 31 | 32 | assert labels == [True, False] 33 | assert label_weights == {True: 5} 34 | assert population_rates == {True: 0.1, False: 0.9} 35 | 
-------------------------------------------------------------------------------- /.github/workflows/publish_python_package.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - 'revscoring/about.py' 9 | 10 | jobs: 11 | build-n-publish: 12 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@master 16 | - name: Set up Python 3.7 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: "3.7" 20 | - name: Install pypa/build 21 | run: >- 22 | python -m pip install build --user 23 | - name: Build a binary wheel and a source tarball 24 | run: >- 25 | python -m 26 | build 27 | --sdist 28 | --wheel 29 | --outdir dist/ 30 | . 31 | 32 | - name: Publish distribution 📦 to Test PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | password: ${{ secrets.PYPI_TEST_TOKEN }} 36 | repository_url: https://test.pypi.org/legacy/ 37 | 38 | - name: Publish distribution 📦 to PyPI 39 | uses: pypa/gh-action-pypi-publish@release/v1 40 | with: 41 | user: scoring-internal 42 | password: ${{ secrets.PYPI_PASS }} 43 | -------------------------------------------------------------------------------- /revscoring/languages/features/dictionary/util.py: -------------------------------------------------------------------------------- 1 | import enchant 2 | 3 | REPLACEMENT_CHAR = "\uFFFD" 4 | 5 | 6 | def utf16_cleanup(token): 7 | """ 8 | Removes chars that can't be represented in two bytes. This is important 9 | since `enchant` will expect that all strings passed to it are two-byte 10 | chars and print "This UTF-8 encoding can't convert to UTF-16:" if it can't 11 | decode. This prevents that problem. 
12 | See https://github.com/rfk/pyenchant/issues/58 13 | """ 14 | return "".join(c if ord(c) < 2 ** 16 else REPLACEMENT_CHAR 15 | for c in token) 16 | 17 | 18 | def load_dict(dict_name, target_package): 19 | try: 20 | return enchant.Dict(dict_name) 21 | except enchant.errors.DictNotFoundError: 22 | raise ImportError( 23 | ("No enchant-compatible dictionary found for {0!r}. " + 24 | "Consider installing {1!r}").format(dict_name, target_package)) 25 | 26 | 27 | class MultiDictChecker: 28 | """ 29 | Implements a check() method that will iterate through dictionaries looking 30 | for any correct spelling. 31 | """ 32 | 33 | def __init__(self, *dicts): 34 | self.dicts = dicts 35 | 36 | def check(self, word): 37 | for dict in self.dicts: 38 | if dict.check(word): 39 | return True 40 | return False 41 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/hashing.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that returns 3 | a list of strings (i.e. "tokens") and produces a list of ngram/skipgram 4 | sequences. 5 | 6 | .. autoclass:: revscoring.datasources.meta.hashing.hash 7 | 8 | """ 9 | import json 10 | 11 | import mmh3 12 | 13 | from ..datasource import Datasource 14 | 15 | 16 | class hash(Datasource): 17 | """ 18 | Converts a sequence of items into a sequence of portable hashes (`int`) 19 | based on the result of applying `str()`. E.g. `str(["foo"]) = '["foo"]'` 20 | 21 | :Parameters: 22 | items_datasource : :class:`revscoring.Datasource` 23 | A datasource that generates a list of items to be hashed 24 | n : `int` 25 | The number of potential hashes that can be produced 26 | name : `str` 27 | A name for the datasource. 
28 | """ 29 | 30 | def __init__(self, items_datasource, n=2 ** 20, name=None): 31 | name = self._format_name(name, [items_datasource, n]) 32 | super().__init__(name, self.process, 33 | depends_on=[items_datasource]) 34 | self.n = n 35 | 36 | def process(self, items): 37 | return [mmh3_item(item, self.n) for item in items] 38 | 39 | 40 | def mmh3_item(item, n): 41 | return (2**32 + mmh3.hash(json.dumps(item))) % n 42 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/gramming.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that returns 3 | a list of strings (i.e. "tokens") and produces a list of ngram/skipgram 4 | sequences. 5 | 6 | .. autoclass:: revscoring.datasources.meta.gramming.gram 7 | 8 | """ 9 | from ..datasource import Datasource 10 | 11 | 12 | class gram(Datasource): 13 | """ 14 | Converts a sequence of items into ngrams. 15 | 16 | :Parameters: 17 | items_datasource : :class:`revscoring.Datasource` 18 | A datasource that generates a list of some item 19 | grams : `list` ( `tuple` ( `int` ) ) 20 | A list of ngram and/or skipgram sequences to produce 21 | name : `str` 22 | A name for the datasource. 
23 | """ 24 | 25 | def __init__(self, items_datasource, grams=[(0,)], name=None): 26 | name = self._format_name(name, [items_datasource, grams]) 27 | super().__init__(name, self.process, 28 | depends_on=[items_datasource]) 29 | self.grams = grams 30 | 31 | def process(self, tokens): 32 | return list(gram_tokens(tokens, grams=self.grams)) 33 | 34 | 35 | def gram_tokens(items, grams=[(0,)]): 36 | for i in range(len(items)): 37 | for gram in grams: 38 | if gram == (0,): 39 | yield (items[i], ) 40 | elif len(items) > i + max(gram): 41 | yield tuple(items[i + offset] for offset in gram) 42 | -------------------------------------------------------------------------------- /revscoring/scoring/models/util.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy 4 | 5 | 6 | def normalize(v): 7 | if isinstance(v, numpy.bool_): 8 | return bool(v) 9 | elif isinstance(v, numpy.ndarray): 10 | return [normalize(item) for item in v] 11 | elif v == numpy.NaN: 12 | return "NaN" 13 | elif v == numpy.NINF: 14 | return "-Infinity" 15 | elif v == numpy.PINF: 16 | return "Infinity" 17 | elif isinstance(v, numpy.floating): 18 | return float(v) 19 | elif isinstance(v, tuple): 20 | return list(v) 21 | else: 22 | return v 23 | 24 | 25 | def key_normalize(v): 26 | v = normalize(v) 27 | if isinstance(v, bool) or isinstance(v, int) or isinstance(v, float) or \ 28 | isinstance(v, str): 29 | return v 30 | elif isinstance(v, list) or isinstance(v, dict): 31 | return json.dumps(v) 32 | else: 33 | return str(v) 34 | 35 | 36 | def normalize_json(doc): 37 | if isinstance(doc, dict): 38 | return {key_normalize(k): normalize_json(v) 39 | for k, v in doc.items()} 40 | elif isinstance(doc, list) or isinstance(doc, tuple): 41 | return [normalize_json(v) for v in doc] 42 | else: 43 | return normalize(doc) 44 | 45 | 46 | def format_params(doc): 47 | if doc is None: 48 | return None 49 | else: 50 | return ", ".join("{0}={1}".format(k, 
json.dumps(v)) 51 | for k, v in doc.items()) 52 | -------------------------------------------------------------------------------- /docs/notes_on_adhoc_jobs.txt: -------------------------------------------------------------------------------- 1 | Given a set of rev_ids. Return the vandal scores. 2 | 3 | $ cat rev_ids.tsv | predict --source=enwiki_api.yaml --scorer=enwiki_svc.yaml > predictions.tsv 4 | 5 | ^^ This imagines a UNIX command line utility that takes a set of rev_ids and 6 | makes predictions 7 | 8 | 9 | Imagine some python (requires `pip install mwreverts`): 10 | 11 | # There exists a model file at enwiki.model 12 | from mwapi import Session 13 | import mwreverts 14 | 15 | from revscoring.extractors.api.extractor import Extractor 16 | from revscoring.scoring.models import LinearSVC 17 | 18 | session = Session("https://en.wikipedia.org/w/api.php") 19 | extractor = Extractor(session) 20 | 21 | model = Model.load(open("enwiki.model", "rb")) 22 | 23 | api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') 24 | revisions = next(iter(api_result['query']['pages'].values()))['revisions'] 25 | 26 | # Content that has been revision-deleted has a hidden SHA-1 27 | revisions = [revision for revision in revisions if 'sha1hidden' not in revision] 28 | reverted_set = set() 29 | 30 | for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): 31 | for reverted in revert.reverteds: 32 | reverted_set.add(reverted['sha1']) 33 | 34 | for revision in revisions: 35 | if revision['sha1'] not in reverted_set: # no revert happened 36 | score = model.score([revision['revid']])['probability'][True] 37 | 38 | if score > .5: 39 | print(revision['pagetitle']) 40 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/features/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from 
revscoring.dependencies import DependentSet 2 | 3 | from . import chars, edit_tokens, parsed, tokenized 4 | 5 | prefix = "wikitext.revision" 6 | 7 | 8 | class BaseRevision(DependentSet): 9 | 10 | def __init__(self, name, revision_datasources): 11 | super().__init__(name) 12 | self.datasources = revision_datasources 13 | 14 | if hasattr(self.datasources, "parent"): 15 | self.parent = Revision( 16 | name + ".parent", 17 | self.datasources.parent 18 | ) 19 | """ 20 | :class:`revscoring.features.wikitext.Revision` : The 21 | parent (aka "previous") revision of the page. 22 | """ 23 | 24 | if hasattr(self.datasources, "diff"): 25 | self.diff = Diff( 26 | name + ".diff", 27 | self.datasources.diff 28 | ) 29 | """ 30 | :class:`~revscoring.features.wikitext.Diff` : The 31 | difference between this revision and the parent revision. 32 | """ 33 | 34 | 35 | class Revision(parsed.Revision, chars.Revision, tokenized.Revision, 36 | BaseRevision): 37 | pass 38 | 39 | 40 | class BaseDiff(DependentSet): 41 | 42 | def __init__(self, name, diff_datasources, *args, **kwargs): 43 | super().__init__(name) 44 | self.datasources = diff_datasources 45 | 46 | 47 | class Diff(chars.Diff, edit_tokens.Diff, tokenized.Diff, BaseDiff): 48 | pass 49 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/dicts.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that 3 | return `dict`'s 4 | 5 | .. autoclass:: revscoring.datasources.meta.dicts.keys 6 | 7 | .. autoclass:: revscoring.datasources.meta.dicts.values 8 | 9 | """ 10 | from ..datasource import Datasource 11 | 12 | 13 | class keys(Datasource): 14 | """ 15 | Generates a set of `dict` keys 16 | 17 | :Parameters: 18 | dict_datasource : :class:`revscoring.Datasource` 19 | A datasource that generates a `dict` 20 | name : `str` 21 | A name for the new datasource. 
22 | """ 23 | 24 | def __init__(self, dict_datasource, name=None): 25 | name = self._format_name(name, [dict_datasource]) 26 | super().__init__(name, self.process, 27 | depends_on=[dict_datasource]) 28 | 29 | def process(self, d): 30 | return (d or {}).keys() 31 | 32 | 33 | class values(Datasource): 34 | """ 35 | Generates a list of `dict` values 36 | 37 | :Parameters: 38 | dict_datasource : :class:`revscoring.Datasource` 39 | A datasource that generates a `dict` 40 | name : `str` 41 | A name for the new datasource. 42 | """ 43 | 44 | def __init__(self, dict_datasource, name=None): 45 | name = self._format_name(name, [dict_datasource]) 46 | super().__init__(name, self.process, 47 | depends_on=[dict_datasource]) 48 | 49 | def process(self, d): 50 | return [v for v in (d or {}).values()] 51 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_micro_macro_stats.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from revscoring.scoring.statistics.classification.micro_macro_stats import \ 4 | MicroMacroStats 5 | from revscoring.scoring.statistics.classification.scaled_prediction_statistics import \ 6 | ScaledPredictionStatistics as SPS # noqa 7 | 8 | 9 | def test_micro_macro_stats(): 10 | # (tp, fp, tn, fn) 11 | stats_keys = ['Short', 'Labels', 'Can', 'Be', 'Columns'] 12 | stats_values = [ 13 | SPS(counts=(10, 2, 5, 8)), 14 | SPS(counts=(9, 3, 9, 4)), 15 | SPS(counts=(11, 1, 8, 5)), 16 | SPS(counts=(10, 2, 9, 4)), 17 | SPS(counts=(5, 7, 3, 10)) 18 | ] 19 | stats = OrderedDict() 20 | for key, value in zip(stats_keys, stats_values): 21 | stats[key] = value 22 | mms = MicroMacroStats(stats, 'precision') 23 | 24 | print(mms.format_str({})) 25 | assert len(mms.format_str({}).split('\n')) <= 5 26 | assert list(stats.keys()) == list(mms['labels'].keys()) 27 | 28 | # (tp, fp, tn, fn) 29 | stats = { 30 | 'A really 
long label name': SPS(counts=(10, 2, 5, 8)), 31 | 'Another long label name': SPS(counts=(9, 3, 9, 4)), 32 | 'Again we\'re very long': SPS(counts=(11, 1, 8, 5)), 33 | 'We should be too long': SPS(counts=(10, 2, 9, 4)), 34 | 'One more for good measure': SPS(counts=(5, 7, 3, 10)) 35 | } 36 | mms = MicroMacroStats(stats, 'precision') 37 | 38 | print(mms.format_str({})) 39 | assert len(mms.format_str({}).split('\n')) > 5 40 | -------------------------------------------------------------------------------- /revscoring/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a set of utilities for extracting features and 3 | train/testing :class:`revscoring.Model` from the command-line. When the 4 | revscoring python package is installed, a `revscoring` utility should be 5 | available from the commandline. Run `revscoring -h` for more 6 | information: 7 | 8 | check_model 9 | +++++++++++ 10 | .. automodule:: revscoring.utilities.check_model 11 | 12 | cv_train 13 | ++++++++ 14 | .. automodule:: revscoring.utilities.cv_train 15 | 16 | dump_cache 17 | ++++++++++ 18 | .. automodule:: revscoring.utilities.dump_cache 19 | 20 | extract 21 | +++++++ 22 | .. automodule:: revscoring.utilities.extract 23 | 24 | fetch_idioms 25 | ++++++++++++ 26 | .. automodule:: revscoring.utilities.fetch_idioms 27 | 28 | fetch_text 29 | ++++++++++ 30 | .. automodule:: revscoring.utilities.fetch_text 31 | 32 | fit 33 | +++ 34 | .. automodule:: revscoring.utilities.fit 35 | 36 | intersect_merge_observations 37 | ++++++++++++++++++++++++++++ 38 | .. automodule:: revscoring.utilities.intersect_merge_observations 39 | 40 | model_info 41 | ++++++++++ 42 | .. automodule:: revscoring.utilities.model_info 43 | 44 | score 45 | +++++ 46 | .. automodule:: revscoring.utilities.score 47 | 48 | test_model 49 | ++++++++++ 50 | .. automodule:: revscoring.utilities.test_model 51 | 52 | tune 53 | ++++ 54 | .. 
automodule:: revscoring.utilities.tune 55 | 56 | union_merge_observations 57 | ++++++++++++++++++++++++ 58 | .. automodule:: revscoring.utilities.union_merge_observations 59 | 60 | util 61 | ++++ 62 | .. automodule:: revscoring.utilities.util 63 | """ 64 | -------------------------------------------------------------------------------- /tests/dependencies/test_context.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.dependencies.context import Context 3 | from revscoring.dependencies.dependent import Dependent 4 | 5 | 6 | def test_context(): 7 | # No context 8 | context = Context() 9 | foo = Dependent("foo", lambda: "foo") 10 | bar = Dependent("bar", lambda: "bar") 11 | foobar = Dependent("foobar", lambda foo, bar: foo + bar, 12 | depends_on=[foo, bar]) 13 | assert context.solve(foobar) == "foobar" 14 | assert list(context.solve([foo, bar, foobar])) == ["foo", "bar", "foobar"] 15 | 16 | # Cache context 17 | context = Context(cache={bar: "baz"}) 18 | assert context.solve(foobar) == "foobaz" 19 | 20 | # Context context 21 | mybar = Dependent("bar", lambda: "baz") 22 | 23 | context = Context(context={mybar}) 24 | assert context.solve(foobar) == "foobaz" 25 | 26 | context = Context(context={mybar: mybar}) 27 | assert context.solve(foobar) == "foobaz" 28 | 29 | context = Context(context={bar: mybar}) 30 | assert context.solve(foobar) == "foobaz" 31 | 32 | context = Context(context={bar: lambda: "baz"}) 33 | assert context.solve(foobar) == "foobaz" 34 | context.update(context={bar: lambda: "buzz"}) 35 | assert context.solve(foobar) == "foobuzz" 36 | 37 | assert set(context.expand([foobar])) == {foo, bar, foobar} 38 | 39 | context.update(context={bar: bar}) 40 | assert set(context.dig([foobar])) == {foo, bar} 41 | 42 | assert (context.draw(foobar) == " - \n" + 43 | "\t - \n" + 44 | "\t - \n") 45 | -------------------------------------------------------------------------------- 
/revscoring/languages/features/matches/matches.py: -------------------------------------------------------------------------------- 1 | from . import datasources, features 2 | from ....dependencies import DependentSet 3 | from ....features import wikitext 4 | 5 | 6 | class Matches(DependentSet): 7 | def __init__(self, name, matcher, match_list, exclusions=None, 8 | text_preprocess=None): 9 | super().__init__(name) 10 | self._match_list = match_list 11 | self._exclusions = exclusions 12 | self.revision = features.Revision( 13 | name + ".revision", 14 | datasources.Revision( 15 | name + ".revision", matcher, 16 | wikitext.revision.datasources, 17 | text_preprocess=text_preprocess 18 | ) 19 | ) 20 | """ 21 | :class:`~revscoring.languages.features.matches.Revision` : 22 | The base revision feature set. 23 | """ 24 | 25 | def excluding(self, exclusions, name=None): 26 | """ 27 | Returns a new :class:`~revscoring.languages.features.Matches` 28 | that includes a set of exclusions. 29 | 30 | :Parameters: 31 | exclusions : `list` ( `str` ) 32 | A list of terms to explicitly not match 33 | name : `str` 34 | A new name for the collection. 
If unspecified, the old name 35 | will be used 36 | """ 37 | return self.__class__( 38 | name or self._name + ".excluding({0!r})".format(exclusions), 39 | self._match_list, 40 | exclusions=(self._exclusions or []) + exclusions) 41 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_random_forest.py: -------------------------------------------------------------------------------- 1 | from revscoring.scoring.models.model import Model 2 | from revscoring.scoring.models.random_forest import RandomForest 3 | 4 | from .util import (FEATURES, format_info, pickle_and_unpickle, train_test, 5 | train_test_multilabel) 6 | 7 | 8 | def test_random_forest(): 9 | model = RandomForest(FEATURES, [True, False]) 10 | format_info(model) 11 | train_test(model) 12 | reconstructed_model = pickle_and_unpickle(model) 13 | train_test(reconstructed_model) 14 | format_info(model) 15 | 16 | config = { 17 | 'scorer_models': { 18 | 'test': { 19 | 'class': "revscoring.scoring.models.RandomForest", 20 | 'labels': [True, False], 21 | 'features': [1, 2, 3] 22 | } 23 | } 24 | } 25 | model = Model.from_config(config, 'test') 26 | assert isinstance(model, RandomForest) 27 | 28 | 29 | def test_random_forest_multilabel(): 30 | model = RandomForest(FEATURES, ["A", "B", "C"], multilabel=True) 31 | format_info(model) 32 | train_test_multilabel(model) 33 | reconstructed_model = pickle_and_unpickle(model) 34 | train_test_multilabel(reconstructed_model) 35 | format_info(model) 36 | 37 | config = { 38 | 'scorer_models': { 39 | 'test': { 40 | 'class': "revscoring.scoring.models.RandomForest", 41 | 'labels': ["A", "B", "C"], 42 | 'features': [1, 2, 3] 43 | } 44 | } 45 | } 46 | model = Model.from_config(config, 'test') 47 | assert isinstance(model, RandomForest) 48 | -------------------------------------------------------------------------------- /revscoring/scoring/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module contains a collection of models that implement a simple function: 3 | :func:`~revscoring.Model.score`. Currently, all models are 4 | a subclass of :class:`revscoring.scoring.models.Learned` 5 | which means that they also implement 6 | :meth:`~revscoring.scoring.models.Learned.train` and 7 | :meth:`~revscoring.scoring.models.Learned.cross_validate`. 8 | 9 | Gradient Boosting 10 | +++++++++++++++++ 11 | .. automodule:: revscoring.scoring.models.gradient_boosting 12 | 13 | Naive Bayes 14 | +++++++++++ 15 | .. automodule:: revscoring.scoring.models.naive_bayes 16 | 17 | Linear Regression 18 | +++++++++++++++++ 19 | .. automodule:: revscoring.scoring.models.linear 20 | 21 | Support Vector 22 | ++++++++++++++ 23 | .. automodule:: revscoring.scoring.models.svc 24 | 25 | Random Forest 26 | +++++++++++++ 27 | .. automodule:: revscoring.scoring.models.random_forest 28 | 29 | Abstract classes 30 | ++++++++++++++++ 31 | .. automodule:: revscoring.scoring.models.model 32 | 33 | SciKit Learn-based models 34 | +++++++++++++++++++++++++ 35 | .. 
automodule:: revscoring.scoring.models.sklearn 36 | 37 | """ 38 | from .gradient_boosting import GradientBoosting 39 | from .linear import LogisticRegression 40 | from .model import Classifier, Learned, open_file 41 | from .naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, NaiveBayes 42 | from .random_forest import RandomForest 43 | from .svc import RBFSVC, SVC, LinearSVC 44 | 45 | __all__ = [ 46 | Learned, Classifier, open_file, 47 | SVC, LinearSVC, RBFSVC, NaiveBayes, GaussianNB, MultinomialNB, BernoulliNB, 48 | RandomForest, GradientBoosting, LogisticRegression 49 | ] 50 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_gradient_boosting.py: -------------------------------------------------------------------------------- 1 | from revscoring.scoring.models.gradient_boosting import GradientBoosting 2 | from revscoring.scoring.models.model import Model 3 | 4 | from .util import (FEATURES, format_info, pickle_and_unpickle, train_test, 5 | train_test_multilabel) 6 | 7 | 8 | def test_gradient_boosting(): 9 | model = GradientBoosting(FEATURES, [True, False]) 10 | format_info(model) 11 | train_test(model) 12 | reconstructed_model = pickle_and_unpickle(model) 13 | train_test(reconstructed_model) 14 | format_info(model) 15 | 16 | config = { 17 | 'scorer_models': { 18 | 'test': { 19 | 'class': "revscoring.scoring.models.GradientBoosting", 20 | 'labels': [True, False], 21 | 'features': [1, 2, 3] 22 | } 23 | } 24 | } 25 | model = Model.from_config(config, 'test') 26 | assert isinstance(model, GradientBoosting) 27 | 28 | 29 | def test_gradient_boosting_multilabel(): 30 | model = GradientBoosting(FEATURES, ["A", "B", "C"], multilabel=True) 31 | format_info(model) 32 | train_test_multilabel(model) 33 | reconstructed_model = pickle_and_unpickle(model) 34 | train_test_multilabel(reconstructed_model) 35 | format_info(model) 36 | 37 | config = { 38 | 'scorer_models': { 39 | 'test': { 40 | 'class': 
"revscoring.scoring.models.GradientBoosting", 41 | 'labels': ["A", "B", "C"], 42 | 'features': [1, 2, 3] 43 | } 44 | } 45 | } 46 | model = Model.from_config(config, 'test') 47 | assert isinstance(model, GradientBoosting) 48 | -------------------------------------------------------------------------------- /revscoring/features/functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autofunction:: revscoring.features.trim 3 | """ 4 | from itertools import chain 5 | 6 | from .feature import Constant, Feature, Modifier 7 | 8 | 9 | def trim(features, context=None): 10 | """ 11 | Trims a feature set down to a bare set of :class:`~revscoring.Feature` by 12 | removing :class:`~revscoring.features.Modifier` and 13 | :class:`~revscoring.features.Constant`. 14 | 15 | :Parameters: 16 | features : `list` ( :class:`revscoring.Feature` ) 17 | A feature list to trim 18 | context : `dict` | `set` 19 | A context to apply while trimming 20 | """ 21 | context = context or {} 22 | cache = set() 23 | 24 | if hasattr(features, "__iter__"): 25 | for feature in features: 26 | yield from _trim(feature, context, cache) 27 | else: 28 | yield from _trim(features, context, cache) 29 | 30 | 31 | def _trim(dependent, context, cache): 32 | if isinstance(dependent, Feature): 33 | feature = dependent 34 | if isinstance(feature, Modifier): 35 | for dependent in feature.dependencies: 36 | yield from _trim(dependent, context, cache) 37 | elif isinstance(feature, Constant): 38 | pass 39 | else: 40 | if feature not in cache: 41 | cache.add(feature) 42 | yield feature 43 | 44 | 45 | def vectorize_values(feature_values): 46 | """ 47 | Converts a list of feature_values that contains sub-FeatureVector 48 | into a flat list of values. 
49 | """ 50 | return list(chain(*(val if hasattr(val, "__iter__") else [val] 51 | for val in feature_values))) 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import sys 4 | 5 | from setuptools import find_packages, setup 6 | 7 | about_path = os.path.join(os.path.dirname(__file__), "revscoring/about.py") 8 | exec(compile(open(about_path).read(), about_path, "exec")) 9 | 10 | 11 | if sys.version_info <= (3, 0): 12 | print("Revscoring needs Python 3 to run properly. Your version is " + 13 | platform.python_version()) 14 | sys.exit(1) 15 | 16 | 17 | def read(fname): 18 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 19 | 20 | 21 | def requirements(fname): 22 | return [line.strip() 23 | for line in open(os.path.join(os.path.dirname(__file__), fname))] 24 | 25 | 26 | setup( 27 | python_requires=">=3", 28 | name=__name__, # noqa 29 | version=__version__, # noqa 30 | author=__author__, # noqa 31 | author_email=__author_email__, # noqa 32 | description=__description__, # noqa 33 | url=__url__, # noqa 34 | license=__license__, # noqa 35 | entry_points={ 36 | 'console_scripts': [ 37 | 'revscoring = revscoring.revscoring:main', 38 | ], 39 | }, 40 | packages=find_packages(), 41 | long_description=read('README.md'), 42 | long_description_content_type="text/markdown", 43 | install_requires=requirements("requirements.txt"), 44 | include_package_data=True, 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Programming Language :: Python", 48 | "Programming Language :: Python :: 3", 49 | "Environment :: Other Environment", 50 | "Intended Audience :: Developers", 51 | "License :: OSI Approved :: MIT License", 52 | "Operating System :: OS Independent" 53 | ], 54 | ) 55 | -------------------------------------------------------------------------------- /tests/languages/test_basque.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.languages import basque 4 | from revscoring.datasources import revision_oriented 5 | from revscoring.dependencies import solve 6 | 7 | # from .util import compare_extraction 8 | 9 | BAD = [ 10 | ] 11 | 12 | INFORMAL = [ 13 | ] 14 | 15 | OTHER = [ 16 | ] 17 | 18 | r_text = revision_oriented.revision.text 19 | 20 | 21 | ''' 22 | @mark.nottravis 23 | def test_badwords(): 24 | compare_extraction(basque.badwords.revision.datasources.matches, 25 | BAD, OTHER) 26 | 27 | assert basque.badwords == pickle.loads(pickle.dumps(basque.badwords)) 28 | 29 | 30 | @mark.nottravis 31 | def test_informals(): 32 | compare_extraction(basque.informals.revision.datasources.matches, 33 | INFORMAL, OTHER) 34 | 35 | assert basque.informals == pickle.loads(pickle.dumps(basque.informals)) 36 | ''' 37 | 38 | 39 | def test_dictionary(): 40 | cache = {r_text: "gizonezko dominadun worngly."} 41 | assert solve(basque.dictionary.revision.datasources.dict_words, 42 | cache=cache) == ['gizonezko'] 43 | assert solve(basque.dictionary.revision.datasources.non_dict_words, 44 | cache=cache) == ["dominadun", "worngly"] 45 | 46 | assert basque.dictionary == pickle.loads(pickle.dumps(basque.dictionary)) 47 | 48 | ''' 49 | @mark.nottravis 50 | def test_stopwords(): 51 | cache = {r_text: "আন চলচ্চিত্র."} 52 | assert (solve(basque.stopwords.revision.datasources.stopwords, cache=cache) == 53 | ["আন"]) 54 | assert (solve(basque.stopwords.revision.datasources.non_stopwords, 55 | cache=cache) == 56 | ['চলচ্চিত্র']) 57 | 58 | assert basque.stopwords == pickle.loads(pickle.dumps(basque.stopwords)) 59 | ''' 60 | -------------------------------------------------------------------------------- /revscoring/languages/features/stemmed/datasources.py: -------------------------------------------------------------------------------- 1 | from ....datasources.meta import frequencies, mappers 2 | from 
class DictDiff:
    """
    Represents the difference between two dictionaries
    """
    __slots__ = ('added', 'removed', 'intersection', 'changed', 'unchanged')

    def __init__(self, added, removed, intersection, changed, unchanged):
        self.added = added
        """
        `set` ( `mixed` ) : Keys that only appear in the new dictionary
        """

        self.removed = removed
        """
        `set` ( `mixed` ) : Keys that only appear in the old dictionary
        """

        self.intersection = intersection
        """
        `set` ( `mixed` ) : Keys that appear in both dictionaries
        """

        self.changed = changed
        """
        `set` ( `mixed` ) : Keys present in both dictionaries whose values
        differ
        """

        self.unchanged = unchanged
        """
        `set` ( `mixed` ) : Keys present in both dictionaries with equal
        values
        """


def diff_dicts(a, b):
    """
    Generates a diff between two dictionaries.

    :Parameters:
        a : `dict`
            A dict to diff or `None`
        b : `dict`
            B dict to diff
    """
    old = a or {}  # Treat a missing "before" dict as empty

    added = b.keys() - old.keys()
    removed = old.keys() - b.keys()
    shared = old.keys() & b.keys()

    # Partition the shared keys by whether their values changed.
    changed = {key for key in shared if old[key] != b[key]}
    unchanged = shared - changed

    return DictDiff(added, removed, shared, changed, unchanged)
def get_last_two(id):
    """Return the last two decimal digits of *id* as an int."""
    digits = str(id)
    return int(digits[-2:])
def test_text_extractor():
    """Regex extraction over a plain-text datasource, including None text."""
    matches = solve(text_extractor,
                    cache={text: "This is some text foo bar nope bar foo"})
    assert matches == ["foo bar", "bar foo"]

    # Missing (None) text yields no matches rather than an error.
    assert solve(text_extractor, cache={text: None}) == []

    assert pickle.loads(pickle.dumps(text_extractor)) == text_extractor
def test_segment_extractor():
    """Regex extraction applied per-segment, concatenating all matches."""
    segment_values = ["This is some text foo bar nope bar foo", "foo bar",
                      "foo"]
    found = solve(segment_extractor, cache={segments: segment_values})
    assert found == ["foo bar", "bar foo", "foo bar"]

    assert pickle.loads(pickle.dumps(segment_extractor)) == segment_extractor
67 | """ 68 | 69 | informal_regexes = [ 70 | r"아니오" 71 | r"잠시만요" 72 | r"합니다만", r"입니다만", 73 | r"\w*니다", r"\w+니까", 74 | r"\w*세요", 75 | r"\w*데요", 76 | r"\w*지요", 77 | r"\w*네요", 78 | r"\w*어요", 79 | r"\w*하죠" 80 | ] 81 | 82 | informals = RegexMatches(name + ".informals", informal_regexes) 83 | """ 84 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 85 | informal word detecting regexes. 86 | """ 87 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/classification/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classification statistics can be generated for "Classifiers" -- models 3 | that produce factors (aka levels) as an ouput. E.g. True and False or 4 | "A", "B", or "C". 5 | 6 | .. autoclass:: revscoring.scoring.statistics.Classification 7 | :members: 8 | :member-order: 9 | 10 | .. autoclass:: revscoring.scoring.statistics.classification.Counts 11 | :members: 12 | :member-order: 13 | 14 | .. autoclass:: revscoring.scoring.statistics.classification.Rates 15 | :members: 16 | :member-order: 17 | 18 | .. autoclass:: revscoring.scoring.statistics.classification.MicroMacroStats 19 | :members: 20 | :member-order: 21 | 22 | .. autoclass:: revscoring.scoring.statistics.classification.ScaledPredictionStatistics 23 | :members: 24 | :member-order: 25 | 26 | .. autoclass:: revscoring.scoring.statistics.classification.ScaledThresholdStatistics 27 | :members: 28 | :member-order: 29 | 30 | .. autoclass:: revscoring.scoring.statistics.classification.ScaledClassificationMatrix 31 | :members: 32 | :member-order: 33 | 34 | .. 
from .classification import Classification
from .counts import Counts
from .micro_macro_stats import MicroMacroStats
from .rates import Rates
from .scaled_classification_matrix import ScaledClassificationMatrix
from .scaled_prediction_statistics import ScaledPredictionStatistics
from .scaled_threshold_statistics import ScaledThresholdStatistics
from .threshold_optimization import ThresholdOptimization

# __all__ must list *names* (strings): `from package import *` raises
# TypeError on non-str items, and API tools that read __all__ expect
# strings, not the class objects themselves.
__all__ = ["Classification", "Counts", "Rates", "MicroMacroStats",
           "ScaledPredictionStatistics", "ScaledThresholdStatistics",
           "ScaledClassificationMatrix", "ThresholdOptimization"]
40 | 41 | class key_exists(Datasource): 42 | 43 | def __init__(self, key, dict_datasource, name=None): 44 | self.key = key 45 | if name is None: 46 | name = "{1} in {0}".format(dict_datasource.name, repr(key)) 47 | 48 | super().__init__(name, self.process, depends_on=[dict_datasource]) 49 | 50 | def process(self, d): 51 | return self.key in d 52 | 53 | 54 | def _lookup_keys(keys, d): 55 | if isinstance(keys, str) or not hasattr(keys, "__iter__"): 56 | keys = [keys] 57 | try: 58 | for key in keys: 59 | d = d[key] 60 | except KeyError: 61 | raise KeyError(keys) 62 | return d 63 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_filters.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import re 3 | 4 | from revscoring.datasources.datasource import Datasource 5 | from revscoring.datasources.meta import filters 6 | from revscoring.dependencies import solve 7 | 8 | tokens = Datasource("tokens") 9 | 10 | foo_tokens = filters.regex_matching("foo", tokens, name="foo_tokens") 11 | foo_case_tokens = filters.regex_matching(re.compile("foo"), tokens, 12 | name="foo_case_tokens") 13 | 14 | my_ints = Datasource("my_ints") 15 | 16 | positive_ints = filters.positive(my_ints) 17 | negative_ints = filters.negative(my_ints) 18 | 19 | not_none_tokens = filters.not_none(tokens, name="not_none_tokens") 20 | 21 | 22 | def test_regex_matching(): 23 | cache = {tokens: ["foo", "bar", "FOO"]} 24 | assert (solve(foo_tokens, cache=cache) == 25 | ["foo", "FOO"]) 26 | 27 | assert (solve(foo_case_tokens, cache=cache) == 28 | ["foo"]) 29 | 30 | assert pickle.loads(pickle.dumps(foo_tokens)) == foo_tokens 31 | assert pickle.loads(pickle.dumps(foo_case_tokens)) == foo_case_tokens 32 | 33 | assert foo_tokens != foo_case_tokens 34 | 35 | 36 | def test_positive(): 37 | cache = {my_ints: [1, 0, -1]} 38 | assert (solve(positive_ints, cache=cache) == 39 | [1]) 40 | assert 
def test_key():
    # Single-key lookup from a dict-producing datasource.
    my_dict = Datasource("my_dict")
    foo = key('foo', my_dict)
    assert solve(foo, cache={my_dict: {'foo': "bar"}}) == 'bar'
    # NOTE(review): an empty repr() is implausible -- this expected string
    # (and the one below) looks truncated/garbled; confirm against
    # Datasource.__repr__ before trusting these assertions.
    assert repr(foo) == ""

    # `apply=` post-processes the looked-up value; or_none(int) keeps None.
    bar = key('bar', my_dict, apply=or_none(int))
    assert solve(bar, cache={my_dict: {'bar': None}}) is None
    assert solve(bar, cache={my_dict: {'bar': "1"}}) == 1

    # A list of keys walks nested dicts; a missing path yields None.
    foobar = key(['foo', 'bar'], my_dict)
    assert solve(foobar, cache={my_dict: {'bar': 1}}) is None
    assert solve(foobar, cache={my_dict: {'foo': {'bar': 1}}}) == 1
    # NOTE(review): same garbled-looking expected repr as above.
    assert repr(foobar) == ""

    # All key datasources must survive a pickle round-trip.
    assert pickle.loads(pickle.dumps(foo)) == foo
    assert pickle.loads(pickle.dumps(bar)) == bar
    assert pickle.loads(pickle.dumps(foobar)) == foobar
def extract_first_char(token):
    """Return the leading character of *token* (empty slice if empty)."""
    head = token[:1]
    return head
def test_de1337():
    """Leetspeak characters are mapped back to plain letters."""
    leet_input = {tokens: ["1337", "W4ff1e"]}
    assert solve(de1337_tokens, cache=leet_input) == ["leet", "Waffle"]

    assert pickle.loads(pickle.dumps(de1337_tokens)) == de1337_tokens
def test_rev_doc_by_id():
    """RevDocById must be hashable and survive a pickle round-trip."""
    session = mwapi.Session("foobar")
    rev_doc_by_id = RevDocById(ro.revision, Extractor(session))

    hash(rev_doc_by_id)
    assert pickle.loads(pickle.dumps(rev_doc_by_id)) == rev_doc_by_id
def test_user_info_doc():
    """UserInfoDoc must be hashable and survive a pickle round-trip."""
    session = mwapi.Session("foobar")
    user_info_doc = UserInfoDoc(ro.revision.user, Extractor(session))

    hash(user_info_doc)
    roundtripped = pickle.loads(pickle.dumps(user_info_doc))
    assert roundtripped == user_info_doc
39 | """ 40 | 41 | # Copied from https://gist.github.com/whym/b5ac3feb2a78797c9d98 42 | # Yusuke Matsubara (CCO) 43 | informal_regexes = [ 44 | # Words 45 | r"\(笑\)", 46 | r"\(笑\)", 47 | r"・・・+", 48 | r"お願いします", 49 | r"こんにちは", 50 | r"はじめまして", 51 | r"ありがとうございます", 52 | r"ありがとうございました", 53 | r"すみません", 54 | r"思います", 55 | r"はい", 56 | r"いいえ", 57 | r"ですが", 58 | r"あなた", 59 | r"おっしゃる", 60 | # Patterns 61 | r"ね。", 62 | r"な。", 63 | r"よ。", 64 | r"わ。", 65 | r"が。", 66 | r"は。", 67 | r"に。", 68 | r"か?", 69 | r"んか。", 70 | r"すか。", 71 | r"ます。", 72 | r"せん。", 73 | r"です。", 74 | r"ました。", 75 | r"でした。", 76 | r"しょう。", 77 | r"しょうか。", 78 | r"ください。", 79 | r"下さい。", 80 | r"ますが", 81 | r"ですが", 82 | r"ましたが", 83 | r"でしたが", 84 | r"さん、", 85 | r"様、", 86 | r"ちゃい", 87 | r"ちゃう", 88 | r"ちゃえ", 89 | r"ちゃっ", 90 | r"っちゃ", 91 | r"じゃない", 92 | r"じゃなく" 93 | ] 94 | 95 | informals = RegexMatches(name + ".informals", informal_regexes, 96 | wrapping=False) 97 | """ 98 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 99 | informal word detecting regexes. 100 | """ 101 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/datasources/sentences.py: -------------------------------------------------------------------------------- 1 | from deltas.segmenters import MatchableSegment 2 | 3 | from revscoring.datasources import Datasource 4 | from revscoring.datasources.meta import indexable 5 | 6 | 7 | class Revision: 8 | 9 | def __init__(self, name, revision_datasources): 10 | super().__init__(name, revision_datasources) 11 | 12 | self.sentences = Datasource( 13 | self._name + ".sentences", psw2sentences, 14 | depends_on=[self.paragraphs_sentences_and_whitespace] 15 | ) 16 | """ 17 | A list of "sentences" extracted from the text. 
18 | """ 19 | 20 | 21 | class Diff(): 22 | 23 | def __init__(self, *args, **kwargs): 24 | super().__init__(*args, **kwargs) 25 | 26 | self.sentences_added_removed = Datasource( 27 | self._name + ".sentences_added_removed", set_diff, 28 | depends_on=[self.revision.sentences, 29 | self.revision.parent.sentences] 30 | ) 31 | 32 | self.sentences_added = indexable.index( 33 | 0, self.sentences_added_removed, 34 | name=self._name + ".sentences_added" 35 | ) 36 | """ 37 | A set of sentences that were added in this edit 38 | """ 39 | 40 | self.sentences_removed = indexable.index( 41 | 1, self.sentences_added_removed, 42 | name=self._name + ".sentences_removed" 43 | ) 44 | """ 45 | A set of sentences that were removed in this edit 46 | """ 47 | 48 | 49 | def psw2sentences(segments): 50 | sentences = [] 51 | for paragraph_or_whitespace in segments: 52 | if isinstance(paragraph_or_whitespace, MatchableSegment): 53 | paragraph = paragraph_or_whitespace # We have a paragraph 54 | for sentence_or_whitespace in paragraph: 55 | if isinstance(sentence_or_whitespace, MatchableSegment): 56 | sentence = sentence_or_whitespace # We have a sentence 57 | sentences.append(sentence) 58 | return sentences 59 | 60 | 61 | def set_diff(a, b): 62 | a, b = set(a), set(b) 63 | return (a - b, b - a) 64 | -------------------------------------------------------------------------------- /tests/dependencies/test_dependent.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from pytest import raises 4 | 5 | from revscoring.dependencies.dependent import Dependent, DependentSet 6 | 7 | 8 | def test_dependent(): 9 | 10 | foobar1 = Dependent("foobar", lambda: "foobar1") 11 | foobar2 = Dependent("foobar", lambda: "foobar2") 12 | 13 | assert foobar1 == foobar2 14 | assert foobar1 != "foo" 15 | 16 | assert hash(foobar1) == hash(foobar2) 17 | 18 | assert foobar1 in {foobar2} 19 | 20 | 21 | def test_name_type(): 22 | with raises(TypeError): 23 | 
def test_dependent_set():
    """Exercise DependentSet's container and set-algebra behavior."""
    dep_c = Dependent('c')
    dep_d = Dependent('d')
    dep_e = Dependent('e')

    my_dependents = DependentSet("my_dependents")
    my_dependents.c = dep_c

    # Equality, sizing, and iteration
    assert my_dependents == my_dependents
    assert my_dependents != "foo"
    assert len(my_dependents) == 1
    assert set(my_dependents) == {dep_c}

    # Membership
    assert dep_c in my_dependents
    assert dep_d not in my_dependents, set(my_dependents)

    my_dependents.d = dep_d

    # Set algebra against plain sets
    assert my_dependents & {dep_d} == {dep_d}
    assert my_dependents & {dep_e} == set()
    assert my_dependents | {dep_e} == {dep_c, dep_d, dep_e}
    assert my_dependents - {dep_c} == {dep_d}

    # Nested DependentSets contribute their members transitively.
    my_sub_dependents = DependentSet("my_sub_dependents")
    dep_f = Dependent('f')
    my_sub_dependents.f = dep_f
    my_dependents.sub = my_sub_dependents

    assert my_sub_dependents.f in my_dependents
    assert set(my_dependents) == {dep_c, dep_d, dep_f}
    assert my_dependents & {dep_d} == {dep_d}
    assert my_dependents & {dep_f} == {dep_f}
    assert my_dependents | {dep_e} == {dep_c, dep_d, dep_e, dep_f}
    assert my_dependents - {dep_f} == {dep_c, dep_d}

    assert pickle.loads(pickle.dumps(my_dependents)) == my_dependents
7 | - 3.5 8 | - 3.7 9 | - 3.8 10 | sudo: required 11 | addons: 12 | apt: 13 | packages: 14 | - g++ 15 | - gfortran 16 | - libblas-dev 17 | - liblapack-dev 18 | - libopenblas-dev 19 | - python3-dev 20 | - enchant 21 | - aspell-ar 22 | - aspell-bn 23 | - aspell-el 24 | - aspell-id 25 | - aspell-is 26 | - aspell-pl 27 | - aspell-ro 28 | - aspell-sv 29 | - aspell-ta 30 | - aspell-uk 31 | - myspell-cs 32 | - myspell-de-at 33 | - myspell-de-ch 34 | - myspell-de-de 35 | - myspell-es 36 | - myspell-et 37 | - myspell-fa 38 | - myspell-fr 39 | - myspell-he 40 | - myspell-hr 41 | - myspell-hu 42 | - myspell-lv 43 | - myspell-nb 44 | - myspell-nl 45 | - myspell-pt-pt 46 | - myspell-pt-br 47 | - myspell-ru 48 | - myspell-hr 49 | - hunspell-bs 50 | - hunspell-ca 51 | - hunspell-en-au 52 | - hunspell-en-us 53 | - hunspell-en-gb 54 | - hunspell-eu 55 | - hunspell-gl 56 | - hunspell-it 57 | - hunspell-hi 58 | - hunspell-sr 59 | - hunspell-vi 60 | - voikko-fi 61 | 62 | before_install: {} 63 | install: 64 | - pip install -r requirements.txt 65 | - pip install -r docs/requirements.txt 66 | - python -m nltk.downloader stopwords 67 | - pip install -r test-requirements.txt 68 | - pip install twine 69 | script: 70 | - flake8 . 
def test_gaussian_nb():
    """Train, pickle-round-trip, and config-load a GaussianNB model."""
    model = GaussianNB(FEATURES, [True, False])
    format_info(model)
    train_test(model)

    # The model must still train/test after a pickle round-trip.
    restored = pickle_and_unpickle(model)
    train_test(restored)
    format_info(model)

    config = {
        'scorer_models': {
            'test': {
                'class': "revscoring.scoring.models.GaussianNB",
                'labels': [True, False],
                'features': [1, 2, 3]
            }
        }
    }
    assert isinstance(Model.from_config(config, 'test'), GaussianNB)
"""
These Meta-Features generate a :class:`revscoring.FeatureVector` based on some
:class:`revscoring.Datasource`.

.. autoclass:: revscoring.features.meta.vectorizers.vectorize
"""
from ..feature_vector import FeatureVector


class vectorize(FeatureVector):
    """
    Constructs a :class:`revscoring.FeatureVector` that converts a
    dictionary into a list of values with a predictable order based on a set
    of keys.

    :Parameters:
        dict_datasource : :class:`revscoring.Datasource`
            A datasource that returns a dictionary of values.  If the
            datasource implements a `keys()` method, that will be used for
            selecting keys to vectorize
        keys : `iterable` ( `hashable` )
            A collection of keys to be vectorized from the dictionary.  If
            specified, this will override the `keys()` method on the
            `dict_datasource`
        returns : `func`
            A function that represents the type of value that will be
            contained in the vector.  When called without an argument, this
            function should return the default value for keys missing from
            the dict.
        name : `str`
            A name for the `revscoring.FeatureVector`
    """

    def __init__(self, dict_datasource, keys=None, returns=None, name=None):
        if keys is None:
            if hasattr(dict_datasource, "keys"):
                keys = dict_datasource.keys()
            else:
                raise AttributeError(
                    "{0} does not have a keys() ".format(dict_datasource) +
                    "method and `keys` argument was not specified")

        # `keys` is guaranteed non-None here (either passed in or taken from
        # the datasource).  Sort so that output order is deterministic.
        self.keys = sorted(keys)
        name = self._format_name(name, [dict_datasource, self.keys[:10]])
        super().__init__(name, self.process, depends_on=[dict_datasource],
                         returns=returns)

    def process(self, d):
        """Vectorize `d`, substituting `self.returns()` for missing keys."""
        return [(d[key] if key in d else self.returns())
                for key in self.keys]
def test_dictionary():
    """Check Hebrew dictionary word extraction and feature picklability."""
    cache = {r_text: "סוויפט גדלה בוויומיסינג, לנאשוויל"}
    datasources = hebrew.dictionary.revision.datasources

    in_dict = solve(datasources.dict_words, cache=cache)
    assert in_dict == ['גדלה']

    out_of_dict = solve(datasources.non_dict_words, cache=cache)
    assert out_of_dict == ['סוויפט', 'בוויומיסינג', 'לנאשוויל']

    assert hebrew.dictionary == pickle.loads(pickle.dumps(hebrew.dictionary))
"""
``revscoring intersect_merge_observations -h``
::

    Intersect observation data.  Fields will be merged.  Data is triaged
    according to the order of filenames in the commandline arguments,
    with the later files taking preference over earlier.

    Usage:
        intersect_merge_observations -h | --help
        intersect_merge_observations <input>...
                                     [--output=<path>]
                                     [--id-column=<str>]

    Options:
        <input>            List of input file paths
        --output=<path>    Path to write out the merged observations
                           [default: <stdout>]
        --id-column=<str>  Name of the id column for deduplication.
                           [default: rev_id]
"""
import sys

import deep_merge
import docopt

from .util import dump_observation, read_observations


def main(argv=None):
    """Parse commandline parameters, read files and write merged data."""
    args = docopt.docopt(__doc__, argv=argv)

    if args['--output'] == "<stdout>":
        out_file = sys.stdout
    else:
        out_file = open(args['--output'], "w")

    # Input handles are left to close at process exit (CLI lifetime).
    observation_sets = (read_observations(open(path, "r"))
                        for path in args['<input>'])

    intersected_observations = intersect_merge_observations(
        observation_sets, id_column=args['--id-column'])

    for ob in intersected_observations:
        dump_observation(ob, out_file)


def intersect_merge_observations(observation_sets, id_column):
    """
    Intersect all observations on `id_column`, deep-merging the fields of
    observations that share an id.  Yields the merged observations.

    :Parameters:
        observation_sets : `iterable` ( `iterable` ( `dict` ) )
            One iterable of observations per input file
        id_column : `str`
            The key used to match observations across sets
    """
    observation_maps = [
        {ob[id_column]: ob for ob in observation_set}
        for observation_set in observation_sets]

    for id_ in observation_maps[0]:
        # Only keep ids that appear in every input set.
        if all(id_ in om for om in observation_maps[1:]):
            new_ob = {}
            for observation_map in observation_maps:
                new_ob = deep_merge.merge(new_ob, observation_map[id_])

            yield new_ob
import pickle
import re

from revscoring.datasources import Datasource
from revscoring.dependencies import solve
from revscoring.features.meta import bools

my_item = Datasource("my_item")

my_set = Datasource("my_set")

my_string = Datasource("my_string")


def test_regex_match():
    """regex_match with a raw pattern vs. a pre-compiled pattern."""
    starts_with_t = bools.regex_match(r"^t", my_string)

    # Raw string patterns match regardless of case (see "Too" below).
    for text, expected in [("Foo", False), ("too", True), ("Too", True)]:
        assert solve(starts_with_t, cache={my_string: text}) is expected

    assert pickle.loads(pickle.dumps(starts_with_t)) == starts_with_t

    starts_with_lower_t = bools.regex_match(re.compile(r"^t"), my_string)

    # A pre-compiled pattern is used as-is, so matching is case-sensitive.
    for text, expected in [("Foo", False), ("too", True), ("Too", False)]:
        assert solve(starts_with_lower_t, cache={my_string: text}) is expected

    assert pickle.loads(pickle.dumps(starts_with_lower_t)
                        ) == starts_with_lower_t
"""
``revscoring union_merge_observations -h``
::

    Merge labeled revisions, taking the union of values for any rows with
    the same id. Data is triaged according to the order of filenames in the
    commandline arguments, with the later files taking preference over
    earlier. Behavior is not specified if an input file has duplicate
    revisions within it.

    FIXME: Reading everything into memory is reckless. Estimate where this
    hits a wall.

    Usage:
        union_merge_observations -h | --help
        union_merge_observations <input>...
                                 [--output=<path>]
                                 [--id-column=<str>]

    Options:
        <input>            List of input file paths
        --output=<path>    Path to write out the merged observations
                           [default: <stdout>]
        --id-column=<str>  Name of the id field for deduplication.
                           [default: rev_id]
"""

import collections
import itertools
import sys

import deep_merge
import docopt

from .util import dump_observation, read_observations


def main(argv=None):
    """Parse commandline parameters, read files and write merged data."""
    args = docopt.docopt(__doc__, argv=argv)

    if args['--output'] == "<stdout>":
        out_file = sys.stdout
    else:
        out_file = open(args['--output'], "w")

    observation_chunks = (read_observations(open(path, "r"))
                          for path in args['<input>'])
    all_observations = itertools.chain(*observation_chunks)

    merged_observations = union_merge_observations(
        all_observations, id_column=args['--id-column'])
    for ob in merged_observations:
        dump_observation(ob, out_file)


def union_merge_observations(observations, id_column):
    """
    Merge all observations sharing an `id_column` value, with later
    observations taking precedence when keys match.  Returns an iterable
    (dict view) of the merged observations.
    """
    id_map = collections.defaultdict(dict)
    for ob in observations:
        # Get the id value.
        ob_id = ob[id_column]

        # Merge the contents, with later entries taking precedence when keys
        # match.
        id_map[ob_id] = deep_merge.merge(id_map[ob_id], ob)

    return id_map.values()
"""
This module implements a set of :class:`revscoring.Feature`
for use in scoring revisions. :class:`revscoring.Feature`
lists can be provided to a :func:`revscoring.dependencies.solve`, or
more commonly, to a :class:`revscoring.Extractor` to obtain simple
numerical/boolean values that can be used when modeling revision
scores. The provided features are split conceptually into a set of modules:

Feature collections
+++++++++++++++++++

:mod:`~revscoring.features.revision_oriented`
    Basic features of revisions. E.g. ``revision.user.text_matches(r'.*Bot')``
:mod:`~revscoring.features.bytes`
    Features of the number of bytes of content, byte length of characters,
    etc.
:mod:`~revscoring.features.temporal`
    Features of the time between events of interest. E.g.
    ``revision.user.last_revision.seconds_since``
:mod:`~revscoring.features.wikibase`
    Features of wikibase items and changes made to them. E.g.
    ``revision.diff.property_changed('P31')``
:mod:`~revscoring.features.wikitext`
    Features of wikitext content and differences between revisions. E.g.
    ``revision.diff.uppercase_words_added``

Functions
+++++++++

.. automodule:: revscoring.features.functions

Meta-features
+++++++++++++
Meta-Features are classes that extend :class:`~revscoring.Feature` and
implement common operations on :class:`~revscoring.Datasource` like
:class:`~revscoring.features.meta.aggregators.sum` and
:class:`~revscoring.features.meta.bools.item_in_set`. See
:mod:`revscoring.features.meta` for the full list.

Modifiers
+++++++++
Modifiers are functions that can be applied to a :class:`revscoring.Feature`
to modify the value. E.g. :class:`~revscoring.features.modifiers.log`,
:class:`~revscoring.features.modifiers.max` and
:class:`~revscoring.features.modifiers.add`.
See :mod:`~revscoring.features.modifiers` for the full list.

Base classes
++++++++++++

.. automodule:: revscoring.features.feature

.. automodule:: revscoring.features.feature_vector
"""

from .feature import Constant, Feature, Modifier
from .feature_vector import FeatureVector
from .functions import trim, vectorize_values

# `__all__` must contain *names* (strings), not the objects themselves;
# listing objects breaks `from revscoring.features import *` with a
# TypeError at import time.
__all__ = ["Feature", "Modifier", "Constant", "FeatureVector", "trim",
           "vectorize_values"]
def main():
    """Dispatch ``revscoring <utility>`` to the matching utilities module.

    Prints usage/help to stderr and exits non-zero when no utility is
    named, when help is requested, or when the utility cannot be imported.
    """
    if len(sys.argv) < 2:
        sys.stderr.write(USAGE)
        sys.exit(1)

    utility_name = sys.argv[1]
    if utility_name in ("-h", "--help"):
        # The module docstring doubles as the full help text.
        sys.stderr.write(__doc__ + "\n")
        sys.exit(1)
    if utility_name.startswith("-"):
        sys.stderr.write(USAGE)
        sys.exit(1)

    try:
        module = import_module(".utilities." + utility_name,
                               package="revscoring")
    except ImportError:
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write(
            "Could not find utility {0}.\n".format(utility_name))
        sys.exit(1)

    module.main(sys.argv[2:])


if __name__ == "__main__":
    main()
37 | (fh, out_file) = tempfile.mkstemp() 38 | 39 | # Do the union. 40 | argv = in_files + ["--output", out_file] 41 | main(argv) 42 | 43 | # Get results. 44 | with open(out_file, "r") as f: 45 | out_text = f.read() 46 | 47 | # Clean up test file :( It would be better if a context manager could 48 | # ensure we don't abort before cleaning up. 49 | os.unlink(out_file) 50 | 51 | # Split result into lines. 52 | lines = out_text.strip().split("\n") 53 | 54 | # There should be six records. 55 | assert len(lines) == 6 56 | 57 | # If this counter is set to 1, it tells us that our row of interest was 58 | # present and not duplicated. 59 | count_merged = 0 60 | 61 | # Spot check a couple o' lines. 62 | for line in lines: 63 | obj = json.loads(line) 64 | if obj["rev_id"] == "16124458": 65 | assert obj == {"damaging": 0, "goodfaith": 1, "approved": 1, 66 | "rev_id": "16124458"} 67 | if obj["rev_id"] == "16124390": 68 | assert obj == {"damaging": 0, "approved": 1, "goodfaith": 0, 69 | "foo": 1, "rev_id": "16124390"} 70 | count_merged += 1 71 | 72 | assert count_merged == 1 73 | -------------------------------------------------------------------------------- /revscoring/languages/finnish.py: -------------------------------------------------------------------------------- 1 | from .features import RegexMatches, Stopwords 2 | 3 | name = "finnish" 4 | 5 | # No dictionary 6 | 7 | # No stemmer 8 | 9 | try: 10 | from nltk.corpus import stopwords as nltk_stopwords 11 | stopwords = set(nltk_stopwords.words('finnish')) 12 | except LookupError: 13 | raise ImportError("Could not load stopwords for {0}. ".format(__name__) + 14 | "You may need to install the nltk 'stopwords' " + 15 | "corpora. 
# Wrap the raw NLTK stopword set in a Stopwords feature.  NOTE: this rebinds
# the module-level `stopwords` name from a `set` to the feature collection.
stopwords = Stopwords(name + ".stopwords", stopwords)
"""
:class:`~revscoring.languages.features.Stopwords` features provided by
`nltk.corpus.stopwords <http://www.nltk.org/>`_ "finnish"
"""

# Raw regular expressions used to build the `badwords` feature below.
badword_regexes = [
    r"homo",
    r"homoja",
    r"homot",
    r"hintti",
    r"homppeli",
    r"huora",
    r"idiootti",
    r"jumalauta",
    r"juntti",
    r"kakka",
    r"kakkaa",
    r"kikkeli",
    r"kyrpä",
    r"kulli",
    r"kusi",
    r"kusipää",
    r"läski",
    r"mamu",
    r"matu",
    r"neekeri",
    r"nussii",
    r"narttu",
    r"paska",
    r"paskaa",
    r"paskat",
    r"paskin",
    r"paskova",
    r"pelle",
    r"perse",
    r"perseeseen",
    r"perseessä",
    r"perseestä",
    r"perseenreikä",
    r"perkele",
    r"pillu",
    r"pilluun",
    r"pippeli",
    r"pieru",
    r"retardi",
    r"runkkari",
    r"saatana",
    r"saatanan",
    r"tyhmä",
    r"vammane",
    r"vammanen",
    r"vittu",
    r"vitun",
    r"äpärä"
]

badwords = RegexMatches(name + ".badwords", badword_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

# Raw regular expressions used to build the `informals` feature below.
informal_regexes = [
    r"haistakaa",
    r"imekää",
    r"lol",
    r"ootte",
    r"moi",
    r"hei",
    r"sinä",
    r"sä",
    r"minä",
    r"mää",
    r"ok",
    r"joo",
    r"okei"
]

informals = RegexMatches(name + ".informals", informal_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""
100 | """ 101 | -------------------------------------------------------------------------------- /tests/languages/test_japanese.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.languages import japanese 4 | 5 | from .util import compare_extraction 6 | 7 | BAD = [ 8 | "死ね", 9 | "しね", 10 | "シネ", 11 | "あほ", 12 | "アホ", 13 | "ばか", 14 | "バカ", 15 | "やりまん", 16 | "ヤリマン", 17 | "まんこ", 18 | "マンコ", 19 | "うんこ", 20 | "ウンコ", 21 | "きもい", 22 | "キモイ", 23 | "痴女", 24 | "淫乱", 25 | "在日", 26 | "チョン", 27 | "支那", 28 | "うざい", 29 | "うぜー", 30 | "wwww", 31 | "wwww", 32 | "wwwwwwww", 33 | "wwwwwwwwwwwwwww" 34 | ] 35 | 36 | INFORMAL = [ 37 | # Words 38 | "(笑)", 39 | "(笑)", 40 | "・・・", 41 | "お願いします", 42 | "こんにちは", 43 | "はじめまして", 44 | "ありがとうございます", 45 | "ありがとうございました", 46 | "すみません", 47 | "思います", 48 | "はい", 49 | "いいえ", 50 | "ですが", 51 | "あなた", 52 | "おっしゃる", 53 | 54 | # sub-word patterns 55 | "ね。", 56 | "な。", 57 | "よ。", 58 | "わ。", 59 | "が。", 60 | "は。", 61 | "に。", 62 | "か?", 63 | "んか。", 64 | "すか。", 65 | "ます。", 66 | "せん。", 67 | "です。", 68 | "ました。", 69 | "でした。", 70 | "しょう。", 71 | "しょうか。", 72 | "ください。", 73 | "下さい。", 74 | "ますが", 75 | "ですが", 76 | "ましたが", 77 | "でしたが", 78 | "さん、", 79 | "様、", 80 | "ちゃい", 81 | "ちゃう", 82 | "ちゃえ", 83 | "ちゃっ", 84 | "っちゃ", 85 | "じゃない", 86 | "じゃなく" 87 | ] 88 | 89 | OTHER = [ 90 | """ 91 | 本項で解説する地方病とは、山梨県における日本住血吸虫症の呼称であり、 92 | 長い間その原因が明らかにならず住民を苦しめた感染症である。ここでは、 93 | その克服・撲滅に至る歴史について説明する。 94 | この疾患は住血吸虫類に分類される寄生虫である日本住血吸虫の寄生によって発症する寄生虫病であり、 95 | ヒトを含む哺乳類全般の血管内部に寄生感染する人獣共通感染症でもある。 96 | 97 | 病名および原虫に日本の国名が冠されているのは、 98 | 疾患の原因となる病原体(日本住血吸虫)の生体が、 99 | 世界で最初に日本国内(現:山梨県甲府市)で発見されたことによるものであって、 100 | 日本固有の疾患というわけではない。日本住血吸虫症は、中国、フィリピン、 101 | インドネシアの3カ国を中心に、 102 | 年間数千人から数万人規模の新規感染患者が発生しており、 103 | 世界保健機関 (WHO)などによって、さまざまな対策が行われている。 104 | """ 105 | ] 106 | 107 | 108 | def test_badwords(): 109 | compare_extraction(japanese.badwords.revision.datasources.matches, 110 | BAD, OTHER) 111 | 112 | assert japanese.badwords == 
import logging
from collections import OrderedDict

from tabulate import tabulate

from ... import util
from ...model_info import ModelInfo

logger = logging.getLogger(__name__)

# Column-major tables wider than this (in characters) are rendered
# row-major instead.
MAX_COLUMNS_WIDTH_CHARS = 80


class Rates(ModelInfo):
    """
    Label rates observed in a sample and, optionally, the expected rates
    in the population.
    """

    def __init__(self, counts, population_rates=None):
        """
        :Parameters:
            counts : `dict`
                Label counts containing an ``'n'`` total and a
                ``'labels'`` map of label --> count.
            population_rates : `dict`
                Optional map of label --> expected rate in the population.
        """
        super().__init__()
        self['sample'] = OrderedDict(
            (label, lcount / counts['n'])
            for label, lcount in counts['labels'].items())
        if population_rates:
            self['population'] = OrderedDict(
                (label, population_rates[label]) for label in counts['labels'])

    def format_str(self, path_tree, ndigits=3, **kwargs):
        """Render the rates as an indented, human-readable table."""
        if len(path_tree) > 0:
            # `Logger.warn` is a deprecated alias; use `warning`.
            logger.warning("Ignoring path_tree={0!r} while formatting rates."
                           .format(path_tree))

        formatted = "rates:\n"
        table_str = self.format_table(ndigits)
        formatted += util.tab_it_in(table_str)
        return formatted

    def format_json(self, path_tree, ndigits=3, **kwargs):
        """Render the rates as a JSON-compatible dict of rounded values."""
        doc = OrderedDict()
        for key in path_tree or self.keys():
            sub_tree = path_tree.get(key, {})
            if len(sub_tree) > 0:
                logger.warning(
                    "Ignoring path_tree={0!r} while formatting rates."
                    .format(sub_tree))
            group = self[key]
            doc[key] = {label: util.round(group[label], ndigits)
                        for label in group}
        return doc

    def format_table(self, ndigits):
        """Pick column- or row-major layout based on total header width."""
        column_header_width = sum(max(len(str(label)) + 2, ndigits + 4)
                                  for label in self['sample'])
        if column_header_width < MAX_COLUMNS_WIDTH_CHARS:
            return self.format_column_major_table(ndigits)
        else:
            return self.format_row_major_table(ndigits)

    def format_column_major_table(self, ndigits):
        """One row per group ('sample'/'population'), one column per label."""
        return tabulate(
            [[group] + [util.round(self[group].get(label), ndigits)
                        for label in self['sample']]
             for group in self],
            headers=[''] + [repr(label) for label in self['sample']])

    def format_row_major_table(self, ndigits):
        """One row per label, one column per group ('sample'/'population')."""
        return tabulate(
            [([label] +
              [util.round(self[group][label], ndigits) for group in self])
             for label in self['sample']],
            headers=[''] + list(self.keys()))
16 | """ 17 | 18 | # https://vi.wiktionary.org/wiki/Th%C3%A0nh_vi%C3%AAn:Laurent_Bouvier/ 19 | # Free_Vietnamese_Dictionary_Project_Vietnamese-Vietnamese#Allwiki_.28closed.29 20 | stopwords = set([ 21 | "ai", "bằng", "bị", "bộ", "cho", "chưa", "chỉ", "cuối", "cuộc", 22 | "các", "cách", "cái", "có", "cùng", "cũng", "cạnh", "cả", "cục", 23 | "của", "dùng", "dưới", "dừng", "giữa", "gì", "hay", "hoặc", 24 | "khi", "khác", "không", "luôn", "là", "làm", "lại", "mà", "mọi", 25 | "mỗi", "một", "nhiều", "như", "nhưng", "nào", "này", "nữa", 26 | "phải", "qua", "quanh", "quá", "ra", "rất", "sau", "sẽ", "sự", 27 | "theo", "thành", "thêm", "thì", "thứ", "trong", "trên", "trước", 28 | "trừ", "tuy", "tìm", "từng", "và", "vài", "vào", "vì", "vẫn", 29 | "về", "với", "xuống", "đang", "đã", "được", "đấy", "đầu", "đủ" 30 | ]) 31 | 32 | stopwords = Stopwords(name + ".stopwords", stopwords) 33 | """ 34 | :class:`~revscoring.languages.features.Stopwords` features copied from 35 | https://vi.wiktionary.org/wiki/Th%C3%A0nh_vi%C3%AAn:Laurent_Bouvier/Free_Vietnamese_Dictionary_Project_Vietnamese-Vietnamese#Allwiki_.28closed.29 36 | """ # noqa 37 | 38 | badword_regexes = [ 39 | # Vietnamese 40 | r"[ck]ặ[tc]", r"[ck]u", r"cứt", r"(dz?|gi)âm", r"đái", r"đéo", r"đ[ụù]", 41 | r"đĩ", r"đ[íị]t", r"ỉa", r"l[ôồ]n", r"trứng" 42 | ] 43 | 44 | badwords = RegexMatches(name + ".badwords", badword_regexes) 45 | """ 46 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 47 | badword detecting regexes. 
48 | """ 49 | 50 | informal_regexes = [ 51 | # Vietnamese 52 | r"bợn", r"bro", 53 | r"chẳng", r"ch[ớứ]", r"cú", 54 | r"đụ", r"đừng", r"fải", 55 | r"khỉ", 56 | r"mày", r"nghịch", r"ngu", r"ngụy", r"nguỵ", 57 | r"ok", r"ơi", 58 | r"quái", 59 | r"thằng", r"thôi", r"tui", r"ừ", r"vời", r"wái?", 60 | r"zì" 61 | ] 62 | 63 | informals = RegexMatches(name + ".informals", informal_regexes) 64 | """ 65 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 66 | informal word detecting regexes. 67 | """ 68 | -------------------------------------------------------------------------------- /revscoring/utilities/test_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``revscoring test_model -h`` 3 | :: 4 | 5 | Tests a scorer model. This utility expects to get a file of 6 | tab-separated feature values and labels from which to test a model. 7 | 8 | Usage: 9 | test_model -h | --help 10 | test_model