├── tests ├── __init__.py ├── extractors │ ├── __init__.py │ ├── api │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_extractor.py │ │ │ ├── test_util.py │ │ │ └── test_datasources.py │ └── test_extractor.py ├── features │ ├── __init__.py │ ├── bytes │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_revision_oriented.py │ ├── meta │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_vectorizers.py │ │ │ └── test_bools.py │ ├── temporal │ │ ├── __init__.py │ │ └── tests │ │ │ └── __init__.py │ ├── wikibase │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ └── test_util.py │ ├── wikitext │ │ ├── __init__.py │ │ └── tests │ │ │ └── __init__.py │ ├── test_modifiers.py │ └── test_functions.py ├── languages │ ├── __init__.py │ ├── features │ │ ├── __init__.py │ │ ├── dictionary │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ │ ├── __init__.py │ │ │ │ └── test_util.py │ │ ├── stemmed │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ │ └── __init__.py │ │ ├── stopwords │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ │ └── __init__.py │ │ └── matches │ │ │ ├── tests │ │ │ └── __init__.py │ │ │ └── test_substrings.py │ ├── util.py │ ├── test_basque.py │ ├── test_hebrew.py │ ├── test_japanese.py │ ├── test_korean.py │ ├── test_persian.py │ └── test_finnish.py ├── scoring │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_util.py │ │ │ ├── test_model.py │ │ │ ├── test_random_forest.py │ │ │ ├── test_gradient_boosting.py │ │ │ ├── test_naive_bayes.py │ │ │ └── test_svc.py │ ├── statistics │ │ ├── __init__.py │ │ ├── tests │ │ │ └── __init__.py │ │ └── classification │ │ │ ├── __init__.py │ │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_counts.py │ │ │ ├── test_threshold_optimization.py │ │ │ ├── test_multilabel_counts.py │ │ │ ├── test_micro_macro_stats.py │ │ │ └── test_rates.py │ ├── test_environment.py │ ├── test_model_info.py │ ├── test_util.py │ └── 
test_labels.py ├── utilities │ ├── __init__.py │ ├── data │ │ ├── labeled_foo.json │ │ └── labeled_revisions.json │ ├── test_fetch_idioms.py │ ├── test_union_intersect_observations.py │ ├── test_util.py │ └── test_union_merge_observations.py ├── datasources │ ├── __init__.py │ ├── meta │ │ ├── __init__.py │ │ └── tests │ │ │ ├── __init__.py │ │ │ ├── test_gramming.py │ │ │ ├── test_hashing.py │ │ │ ├── test_dicts.py │ │ │ ├── test_vectorizers.py │ │ │ ├── test_extractors.py │ │ │ ├── test_filters.py │ │ │ ├── test_mappers.py │ │ │ └── test_frequencies.py │ ├── util.py │ └── test_datasource.py ├── dependencies │ ├── __init__.py │ ├── test_context.py │ └── test_dependent.py └── test_score_processor.py ├── docs ├── changelog.rst ├── requirements.txt ├── revscoring.scoring.rst ├── revscoring.features.rst ├── revscoring.languages.rst ├── revscoring.utilities.rst ├── revscoring.datasources.rst ├── revscoring.extractors.rst ├── revscoring.dependencies.rst ├── revscoring.features.meta.rst ├── revscoring.features.bytes.rst ├── revscoring.scoring.models.rst ├── revscoring.datasources.meta.rst ├── revscoring.features.temporal.rst ├── revscoring.features.wikibase.rst ├── revscoring.features.wikitext.rst ├── revscoring.features.modifiers.rst ├── revscoring.languages.features.rst ├── revscoring.scoring.statistics.rst ├── revscoring.features.revision_oriented.rst ├── revscoring.datasources.revision_oriented.rst ├── index.rst ├── notes │ └── 2015-10-07.language_structure.txt ├── api_reference.rst └── notes_on_adhoc_jobs.txt ├── MANIFEST.in ├── utility ├── revscoring ├── features │ ├── util.py │ ├── wikibase │ │ ├── datasources │ │ │ └── __init__.py │ │ ├── features │ │ │ └── __init__.py │ │ ├── revision_oriented.py │ │ ├── __init__.py │ │ └── util.py │ ├── wikitext │ │ ├── datasources │ │ │ ├── __init__.py │ │ │ ├── revision_oriented.py │ │ │ └── sentences.py │ │ ├── features │ │ │ ├── __init__.py │ │ │ └── revision_oriented.py │ │ └── revision_oriented.py │ ├── bytes │ │ ├── 
__init__.py │ │ ├── datasources.py │ │ └── revision_oriented.py │ ├── meta │ │ ├── __init__.py │ │ └── vectorizers.py │ ├── temporal │ │ └── __init__.py │ ├── feature_vector.py │ ├── modifiers.py │ ├── functions.py │ └── __init__.py ├── extractors │ ├── api │ │ ├── __init__.py │ │ └── util.py │ └── __init__.py ├── about.py ├── dependencies │ ├── util.py │ └── __init__.py ├── languages │ ├── basque.py │ ├── features │ │ ├── stemmed │ │ │ ├── __init__.py │ │ │ ├── stemmed.py │ │ │ └── datasources.py │ │ ├── stopwords │ │ │ ├── __init__.py │ │ │ └── stopwords.py │ │ ├── dictionary │ │ │ ├── __init__.py │ │ │ ├── dictionary.py │ │ │ └── util.py │ │ ├── __init__.py │ │ └── matches │ │ │ ├── __init__.py │ │ │ ├── substring_matches.py │ │ │ ├── regex_matches.py │ │ │ └── matches.py │ ├── korean.py │ ├── japanese.py │ ├── finnish.py │ ├── vietnamese.py │ ├── romanian.py │ └── hebrew.py ├── scoring │ ├── models │ │ ├── gradient_boosting.py │ │ ├── random_forest.py │ │ ├── linear.py │ │ ├── svc.py │ │ ├── naive_bayes.py │ │ ├── util.py │ │ └── __init__.py │ ├── statistics │ │ ├── __init__.py │ │ ├── statistics.py │ │ └── classification │ │ │ ├── label_thresholds.py │ │ │ ├── __init__.py │ │ │ └── rates.py │ ├── __init__.py │ └── util.py ├── datasources │ ├── meta │ │ ├── timestamp.py │ │ ├── __init__.py │ │ ├── indexable.py │ │ ├── hashing.py │ │ ├── gramming.py │ │ └── dicts.py │ ├── datasource.py │ └── __init__.py ├── utilities │ ├── check_model.py │ ├── model_info.py │ ├── __init__.py │ ├── intersect_merge_observations.py │ ├── union_merge_observations.py │ ├── test_model.py │ ├── fetch_idioms.py │ └── fit.py └── revscoring.py ├── .codecov.yml ├── scripts └── deploy.sh ├── CODE_OF_CONDUCT.md ├── setup.cfg ├── test-requirements.txt ├── config ├── logistic_regression.params.yaml ├── random_forest.params.yaml ├── gradient_boost.params.yaml ├── naive_bayes.params.yaml └── svc.params.yaml ├── pytest.ini ├── tox.ini ├── examples ├── scoring.py ├── language_support.py └── 
extraction.py ├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ └── publish_python_package.yml ├── requirements.txt ├── Dockerfile ├── .gitignore ├── Makefile ├── LICENSE ├── setup.py └── .travis.yml /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/dependencies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/features/bytes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/meta/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasources/meta/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/extractors/api/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/bytes/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/meta/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/temporal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikibase/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikitext/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/languages/features/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/temporal/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikibase/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/features/wikitext/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/tests/languages/features/stemmed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/stopwords/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. mdinclude:: ../CHANGELOG.md 2 | -------------------------------------------------------------------------------- /tests/languages/features/dictionary/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/matches/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/stemmed/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/languages/features/stopwords/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==2.4.4 2 | sphinx-py3doc-enhanced-theme 3 | m2r 4 | -e . 5 | -------------------------------------------------------------------------------- /tests/utilities/data/labeled_foo.json: -------------------------------------------------------------------------------- 1 | {"foo": 1, "goodfaith": 0, "rev_id": "16124390"} 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE README.md requirements.txt 2 | include revscoring/assets/*.txt 3 | -------------------------------------------------------------------------------- /utility: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from revscoring import revscoring 3 | 4 | revscoring.main() 5 | -------------------------------------------------------------------------------- /revscoring/features/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | SECTION_COMMENT_RE = re.compile(r"\/\*([^\*]|\*[^\/])+\*\/") 4 | -------------------------------------------------------------------------------- /docs/revscoring.scoring.rst: -------------------------------------------------------------------------------- 1 | revscoring.scoring 2 | ================== 3 | 4 | .. 
automodule:: revscoring.scoring 5 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | notify: 3 | require_ci_to_pass: no 4 | 5 | coverage: 6 | status: 7 | patch: off 8 | -------------------------------------------------------------------------------- /docs/revscoring.features.rst: -------------------------------------------------------------------------------- 1 | revscoring.features 2 | =================== 3 | 4 | .. automodule:: revscoring.features 5 | -------------------------------------------------------------------------------- /docs/revscoring.languages.rst: -------------------------------------------------------------------------------- 1 | revscoring.languages 2 | ==================== 3 | 4 | .. automodule:: revscoring.languages 5 | -------------------------------------------------------------------------------- /docs/revscoring.utilities.rst: -------------------------------------------------------------------------------- 1 | revscoring.utilities 2 | ==================== 3 | 4 | .. 
automodule:: revscoring.utilities 5 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | from .revision_oriented import Revision 2 | 3 | __all__ = [Revision] 4 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | from .revision_oriented import Revision 2 | 3 | __all__ = [Revision] 4 | -------------------------------------------------------------------------------- /docs/revscoring.datasources.rst: -------------------------------------------------------------------------------- 1 | revscoring.datasources 2 | ====================== 3 | 4 | .. automodule:: revscoring.datasources 5 | -------------------------------------------------------------------------------- /docs/revscoring.extractors.rst: -------------------------------------------------------------------------------- 1 | revscoring.extractors 2 | ===================== 3 | 4 | .. automodule:: revscoring.extractors 5 | -------------------------------------------------------------------------------- /revscoring/extractors/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractor import Extractor, MWAPICache 2 | 3 | __all__ = [Extractor, MWAPICache] 4 | -------------------------------------------------------------------------------- /docs/revscoring.dependencies.rst: -------------------------------------------------------------------------------- 1 | revscoring.dependencies 2 | ======================= 3 | 4 | .. 
automodule:: revscoring.dependencies 5 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .revision_oriented import Diff, Revision 2 | 3 | __all__ = [Revision, Diff] 4 | -------------------------------------------------------------------------------- /scripts/deploy.sh: -------------------------------------------------------------------------------- 1 | python setup.py sdist bdist_wheel && twine upload dist/* --skip-existing --username $PYPI_USER --password $PYPI_PASS 2 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | The development of this software is covered by a [Code of Conduct](https://www.mediawiki.org/wiki/Code_of_Conduct). 2 | 3 | -------------------------------------------------------------------------------- /docs/revscoring.features.meta.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.meta 2 | ======================== 3 | 4 | .. automodule:: revscoring.features.meta 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal = 1 3 | 4 | [build_sphinx] 5 | source-dir = doc/ 6 | build-dir = doc/_build 7 | all_files = 1 8 | 9 | -------------------------------------------------------------------------------- /docs/revscoring.features.bytes.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.bytes 2 | ========================= 3 | 4 | .. 
automodule:: revscoring.features.bytes 5 | -------------------------------------------------------------------------------- /docs/revscoring.scoring.models.rst: -------------------------------------------------------------------------------- 1 | revscoring.scoring.models 2 | ========================= 3 | 4 | .. automodule:: revscoring.scoring.models 5 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | pytest >= 4.6.0, < 4.6.999 2 | codecov 3 | pytest-cov 4 | google-compute-engine <= 2.7.999 5 | flake8 >= 3.8.1, < 3.8.999 6 | -------------------------------------------------------------------------------- /docs/revscoring.datasources.meta.rst: -------------------------------------------------------------------------------- 1 | revscoring.datasources.meta 2 | =========================== 3 | 4 | .. automodule:: revscoring.datasources.meta 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.temporal.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.temporal 2 | ============================ 3 | 4 | .. automodule:: revscoring.features.temporal 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.wikibase.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.wikibase 2 | ============================ 3 | 4 | .. automodule:: revscoring.features.wikibase 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.wikitext.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.wikitext 2 | ============================ 3 | 4 | .. 
automodule:: revscoring.features.wikitext 5 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/features/__init__.py: -------------------------------------------------------------------------------- 1 | from .diff import Diff 2 | from .revision_oriented import Revision 3 | 4 | __all__ = [Revision, Diff] 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.modifiers.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.modifiers 2 | ============================= 3 | 4 | .. automodule:: revscoring.features.modifiers 5 | -------------------------------------------------------------------------------- /docs/revscoring.languages.features.rst: -------------------------------------------------------------------------------- 1 | revscoring.languages.features 2 | ============================= 3 | 4 | .. automodule:: revscoring.languages.features 5 | -------------------------------------------------------------------------------- /docs/revscoring.scoring.statistics.rst: -------------------------------------------------------------------------------- 1 | revscoring.scoring.statistics 2 | ============================= 3 | 4 | .. 
automodule:: revscoring.scoring.statistics 5 | -------------------------------------------------------------------------------- /config/logistic_regression.params.yaml: -------------------------------------------------------------------------------- 1 | LogisticRegression: 2 | class: revscoring.scoring.models.LogisticRegression 3 | params: 4 | penalty: ["l1", "l2"] 5 | C: [0.1, 1, 10] 6 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | nottravis: marks tests as being known to fail in travis for unconcerning reasons 4 | addopts = --ignore=revscoring/utilities/test_model.py 5 | -------------------------------------------------------------------------------- /docs/revscoring.features.revision_oriented.rst: -------------------------------------------------------------------------------- 1 | revscoring.features.revision_oriented 2 | ===================================== 3 | 4 | .. automodule:: revscoring.features.revision_oriented 5 | -------------------------------------------------------------------------------- /docs/revscoring.datasources.revision_oriented.rst: -------------------------------------------------------------------------------- 1 | revscoring.datasources.revision_oriented 2 | ======================================== 3 | 4 | .. 
automodule:: revscoring.datasources.revision_oriented 5 | -------------------------------------------------------------------------------- /tests/datasources/util.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | 5 | 6 | def check_datasource(ds): 7 | assert isinstance(ds, Datasource) 8 | assert pickle.loads(pickle.dumps(ds)) == ds 9 | -------------------------------------------------------------------------------- /tests/languages/features/dictionary/tests/test_util.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.languages.features.dictionary.util import utf16_cleanup 3 | 4 | 5 | def test_utf16_cleanup(): 6 | assert (utf16_cleanup("Foobar" + chr(2 ** 16)) == 7 | "Foobar\uFFFD") 8 | -------------------------------------------------------------------------------- /config/random_forest.params.yaml: -------------------------------------------------------------------------------- 1 | RandomForest: 2 | class: revscoring.scoring.models.RandomForest 3 | params: 4 | n_estimators: [128, 256, 512, 1024] 5 | min_samples_leaf: [1, 3, 5, 7, 13] 6 | max_features: ["log2"] 7 | criterion: ["gini", "entropy"] 8 | -------------------------------------------------------------------------------- /config/gradient_boost.params.yaml: -------------------------------------------------------------------------------- 1 | GradientBoosting: 2 | class: revscoring.scoring.models.GradientBoosting 3 | params: 4 | n_estimators: [500, 700, 1200, 1500, 2000] 5 | max_depth: [5, 7, 9, 11] 6 | max_features: ["log2"] 7 | learning_rate: [0.001, 0.01, 0.1, 0.5] 8 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Revision Scoring 2 | ================ 3 | .. 
automodule:: revscoring 4 | 5 | Project Info: 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | 10 | changelog 11 | 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | -------------------------------------------------------------------------------- /config/naive_bayes.params.yaml: -------------------------------------------------------------------------------- 1 | GaussianNB: 2 | class: revscoring.scoring.models.GaussianNB 3 | params: {} 4 | BernoulliNB: 5 | class: revscoring.scoring.models.BernoulliNB 6 | params: {} 7 | MultinomialNB: 8 | class: revscoring.scoring.models.MultinomialNB 9 | params: 10 | alpha: [0.1, 1, 10] 11 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented 2 | 3 | from . import datasources 4 | from .features import Revision 5 | 6 | name = "wikitext.revision" 7 | 8 | revision = Revision( 9 | name, 10 | datasources.Revision(name, revision_oriented.revision) 11 | ) 12 | -------------------------------------------------------------------------------- /revscoring/about.py: -------------------------------------------------------------------------------- 1 | __name__ = "revscoring" 2 | __version__ = "2.11.13" 3 | __author__ = "Aaron Halfaker" 4 | __author_email__ = "ahalfaker@wikimedia.org" 5 | __description__ = ("A set of utilities for generating quality scores for" + 6 | " MediaWiki revisions") 7 | __url__ = "https://github.com/wikimedia/revscoring" 8 | __license__ = "MIT" 9 | -------------------------------------------------------------------------------- /config/svc.params.yaml: -------------------------------------------------------------------------------- 1 | SVC: 2 | class: sklearn.svm.SVC 3 | params: 4 | - 5 | kernel: ["rbf"] 6 | probability: [true] 7 | 
gamma: [0.0, 0.001, 0.0001] 8 | cache_size: [1000] 9 | C: [0.1, 1, 10] 10 | - 11 | kernel: ["linear"] 12 | probability: [true] 13 | cache_size: [1000] 14 | C: [0.1, 1, 10] 15 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | # This file is code-generated by Sphinx, so we don't care. 4 | doc/conf.py 5 | ignore = E113, 6 | E111, 7 | E126, 8 | E127, 9 | E131, 10 | E305, 11 | # E501 line too long (> 79 characters) 12 | E501 13 | W504 14 | E741 15 | 16 | [pytest] 17 | addopts = --ignore revscoring/utilities/test_model.py 18 | -------------------------------------------------------------------------------- /revscoring/features/bytes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This features module provides access to features of the bytes of content in 3 | revisions. 4 | 5 | .. autodata:: revscoring.features.bytes.revision 6 | 7 | Supporting classes 8 | ++++++++++++++++++ 9 | 10 | .. 
autoclass:: revscoring.features.bytes.Revision 11 | :members: 12 | 13 | """ 14 | from .revision_oriented import Revision, revision 15 | 16 | __all__ = [revision, Revision] 17 | -------------------------------------------------------------------------------- /examples/scoring.py: -------------------------------------------------------------------------------- 1 | import mwapi 2 | 3 | from revscoring import Model 4 | from revscoring.extractors import api 5 | 6 | with open("models/enwiki.damaging.linear_svc.model") as f: 7 | model = Model.load(f) 8 | 9 | extractor = api.Extractor(mwapi.Session(host="https://en.wikipedia.org", 10 | user_agent="revscoring demo")) 11 | values = extractor.extract(123456789, model.features) 12 | print(model.score(values)) 13 | -------------------------------------------------------------------------------- /revscoring/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains a collection of utilities for extracting 3 | :class:`~revscoring.Feature` and 4 | :class:`~revscoring.Datasource` for a revision. 5 | 6 | api 7 | +++ 8 | .. automodule:: revscoring.extractors.api 9 | 10 | extractor 11 | +++++++++ 12 | .. 
automodule:: revscoring.extractors.extractor 13 | 14 | """ 15 | from .extractor import Extractor, OfflineExtractor 16 | 17 | __all__ = [Extractor, OfflineExtractor] 18 | -------------------------------------------------------------------------------- /tests/utilities/data/labeled_revisions.json: -------------------------------------------------------------------------------- 1 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124390"} 2 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124357"} 3 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16123622"} 4 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124436"} 5 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124458"} 6 | {"goodfaith": 1, "approved": 1, "damaging": 0, "rev_id": "16124488"} 7 | -------------------------------------------------------------------------------- /examples/language_support.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources.revision_oriented import revision 2 | from revscoring.dependencies import solve 3 | from revscoring.languages import english, spanish 4 | 5 | features = [english.informals.revision.matches, 6 | spanish.informals.revision.matches] 7 | values = solve(features, cache={revision.text: "I think it is stupid."}) 8 | 9 | for feature, value in zip(features, values): 10 | print("\t{0}: {1}".format(feature, repr(value))) 11 | -------------------------------------------------------------------------------- /tests/datasources/test_datasource.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.dependencies import solve 5 | 6 | 7 | def test_datasource(): 8 | 9 | d = Datasource("d") 10 | 11 | assert pickle.loads(pickle.dumps(d)) == d 12 | 13 | assert solve(d, cache={d: "foo"}) == "foo" 14 | 15 | assert solve(d, 
cache={"datasource.d": "foo"}) == "foo" 16 | 17 | assert str(d) == "datasource.d" 18 | assert repr(d) == "" 19 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_util.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | 3 | from revscoring.scoring.models.util import normalize_json 4 | 5 | 6 | def test_normalize_json(): 7 | doc = {"foo": {numpy.bool_(True): "value"}, 8 | "what": numpy.bool_(False), 9 | "this": numpy.PINF} 10 | normalized_doc = normalize_json(doc) 11 | assert isinstance(normalized_doc['what'], bool) 12 | assert isinstance(list(normalized_doc['foo'].keys())[0], bool) 13 | assert normalized_doc['this'] == "Infinity" 14 | -------------------------------------------------------------------------------- /revscoring/features/meta/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Meta-Features are generalized :class:`revscoring.Datasource` --> 3 | :class:`revscoring.Feature` conversions implemented for convenience. 4 | 5 | aggregators 6 | +++++++++++ 7 | .. automodule:: revscoring.features.meta.aggregators 8 | :members: 9 | 10 | bools 11 | +++++ 12 | .. automodule:: revscoring.features.meta.bools 13 | :members: 14 | 15 | vectorizers 16 | +++++++++++ 17 | .. 
automodule:: revscoring.features.meta.vectorizers 18 | :members: 19 | 20 | """ 21 | -------------------------------------------------------------------------------- /tests/extractors/api/tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | from revscoring.extractors.api.extractor import Extractor 2 | 3 | 4 | def test_from_config(): 5 | config = { 6 | 'extractors': { 7 | 'enwiki': { 8 | 'host': "https://en.wikipedia.org", 9 | 'api_path': "/w/api.php", 10 | 'timeout': 20, 11 | 'user_agent': "revscoring tests" 12 | } 13 | } 14 | } 15 | 16 | Extractor.from_config(config, 'enwiki') # Doesn't error 17 | -------------------------------------------------------------------------------- /revscoring/dependencies/util.py: -------------------------------------------------------------------------------- 1 | 2 | class or_none: 3 | """ 4 | Constructs a callable that will return None if the input is None, but will 5 | otherwise run a function on incoming data. 6 | 7 | :Parameters: 8 | func : `function` 9 | A function to run on non-None inputs 10 | """ 11 | def __init__(self, func): 12 | self.func = func 13 | 14 | def __call__(self, val): 15 | if val is None: 16 | return None 17 | else: 18 | return self.func(val) 19 | -------------------------------------------------------------------------------- /revscoring/languages/basque.py: -------------------------------------------------------------------------------- 1 | from .features import Dictionary 2 | 3 | name = "basque" 4 | 5 | try: 6 | import enchant 7 | dictionary = enchant.Dict("eu") 8 | except enchant.errors.DictNotFoundError: 9 | raise ImportError("No enchant-compatible dictionary found for 'eu'. " + 10 | "Consider installing 'hunspell-eu'.") 11 | 12 | dictionary = Dictionary(name + ".dictionary", dictionary.check) 13 | """ 14 | :class:`~revscoring.languages.features.Dictionary` features via 15 | :class:`enchant.Dict` "eu". 
Provided by `hunspell-eu` 16 | """ 17 | -------------------------------------------------------------------------------- /tests/test_score_processor.py: -------------------------------------------------------------------------------- 1 | from revscoring import Model 2 | from revscoring.extractors import OfflineExtractor 3 | from revscoring.features import Constant 4 | from revscoring.score_processor import ScoreProcessor 5 | 6 | 7 | class FakeModel(Model): 8 | 9 | def score(featue_values): 10 | return not featue_values[0] 11 | 12 | 13 | def test_score_processor(): 14 | 15 | model = FakeModel([Constant(False)]) 16 | 17 | sp = ScoreProcessor(model, OfflineExtractor()) 18 | scores = sp.score([1, 2, 3]) 19 | 20 | for score in scores: 21 | assert score 22 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented 2 | 3 | from . import datasources, features 4 | 5 | name = "wikibase.revision" 6 | 7 | revision = features.Revision( 8 | name, 9 | datasources.Revision(name, revision_oriented.revision) 10 | ) 11 | """ 12 | Represents the base revision of interest. Implements this basic structure: 13 | 14 | * revision: :class:`~revscoring.features.wikibase.Revision` 15 | * parent: :class:`~revscoring.features.wikibase.Revision` 16 | * diff: :class:`~revscoring.features.wikibase.Diff` 17 | """ 18 | -------------------------------------------------------------------------------- /revscoring/scoring/models/gradient_boosting.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Gradient Boosting type classifier models. 3 | 4 | .. 
autoclass:: revscoring.scoring.models.GradientBoosting 5 | :members: 6 | :member-order: 7 | """ 8 | import logging 9 | 10 | from sklearn.ensemble import GradientBoostingClassifier 11 | 12 | from .sklearn import ProbabilityClassifier 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class GradientBoosting(ProbabilityClassifier): 18 | """ 19 | Implements a Gradient Boosting model. 20 | """ 21 | Estimator = GradientBoostingClassifier 22 | -------------------------------------------------------------------------------- /tests/utilities/test_fetch_idioms.py: -------------------------------------------------------------------------------- 1 | from revscoring.utilities.fetch_idioms import is_idiom 2 | 3 | 4 | def test_is_idiom(): 5 | phrases = [ 6 | 'Appendix:English 19th Century idioms', 7 | 'about to', 8 | 'activist justice', 9 | 'attaboy', 10 | 'bat for the other team', 11 | 'beard the lion in his den', 12 | 'as gentle as a dove', 13 | 'I\'ll say' 14 | ] 15 | idioms = [phrase for phrase in phrases if is_idiom(phrase)] 16 | 17 | assert idioms == ['bat for the other team', 'beard the lion in his den'] 18 | -------------------------------------------------------------------------------- /revscoring/scoring/models/random_forest.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Random Forest type classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.RandomForest 5 | :members: 6 | :member-order: 7 | """ 8 | import logging 9 | 10 | from sklearn.ensemble import RandomForestClassifier 11 | 12 | from .sklearn import ProbabilityClassifier 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class RandomForest(ProbabilityClassifier): 18 | """ 19 | Implements a Random Forest model. 
20 | """ 21 | Estimator = RandomForestClassifier 22 | SUPPORTS_CLASSWEIGHT = True 23 | -------------------------------------------------------------------------------- /examples/extraction.py: -------------------------------------------------------------------------------- 1 | from mwapi import Session 2 | 3 | from revscoring.extractors import api 4 | from revscoring.features import temporal, wikitext 5 | 6 | session = Session("https://en.wikipedia.org/w/api.php", user_agent="test") 7 | api_extractor = api.Extractor(session) 8 | 9 | features = [temporal.revision.day_of_week, 10 | temporal.revision.hour_of_day, 11 | wikitext.revision.parent.headings_by_level(2)] 12 | 13 | values = api_extractor.extract(624577024, features) 14 | for feature, value in zip(features, values): 15 | print("\t{0}: {1}".format(feature, repr(value))) 16 | -------------------------------------------------------------------------------- /revscoring/features/bytes/datasources.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import Datasource 2 | 3 | 4 | class Revision: 5 | 6 | def __init__(self, prefix, revision_datasources): 7 | 8 | self.bytes = Datasource( 9 | prefix + ".bytes", _process_bytes, 10 | depends_on=[revision_datasources.text] 11 | ) 12 | 13 | if hasattr(revision_datasources, "parent"): 14 | self.parent = Revision( 15 | prefix + ".parent", 16 | revision_datasources.parent 17 | ) 18 | 19 | 20 | def _process_bytes(text): 21 | return bytes(text, 'utf-8', 'replace') 22 | -------------------------------------------------------------------------------- /revscoring/languages/features/stemmed/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of stemmer applied to words. 3 | 4 | .. autoclass:: revscoring.languages.features.Stemmed 5 | :members: 6 | :member-order: bysource 7 | 8 | Supporting classes 9 | ------------------ 10 | 11 | .. 
autoclass:: revscoring.languages.features.stemmed.Revision 12 | :members: 13 | :member-order: bysource 14 | 15 | .. autoclass:: revscoring.languages.features.stemmed.Diff 16 | :members: 17 | :member-order: bysource 18 | """ 19 | from .features import Diff, Revision 20 | from .stemmed import Stemmed 21 | 22 | __all__ = [Stemmed, Revision, Diff] 23 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_gramming.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.datasources.meta import gramming 5 | from revscoring.dependencies import solve 6 | 7 | my_tokens = Datasource("my_tokens") 8 | my_grams = gramming.gram(my_tokens, grams=[(0,), (0, 2)]) 9 | 10 | 11 | def test_gramming(): 12 | assert (solve(my_grams, cache={my_tokens: ["one", "two", "three", "four"]}) == 13 | [("one",), ("one", "three"), ("two",), ("two", "four"), ("three",), 14 | ("four",)]) 15 | 16 | assert (pickle.loads(pickle.dumps(my_grams)) == 17 | my_grams) 18 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | commit-message: 13 | # Prefix all commit messages with "pip" 14 | prefix: "pip" 15 | -------------------------------------------------------------------------------- /revscoring/languages/features/stopwords/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of filtering words for stopwords 3 | 4 | .. autoclass:: revscoring.languages.features.Stopwords 5 | :members: 6 | :member-order: bysource 7 | 8 | Supporting classes 9 | ------------------ 10 | 11 | .. autoclass:: revscoring.languages.features.stopwords.Revision 12 | :members: 13 | :member-order: bysource 14 | 15 | .. 
autoclass:: revscoring.languages.features.stopwords.Diff 16 | :members: 17 | :member-order: bysource 18 | """ 19 | from .features import Diff, Revision 20 | from .stopwords import Stopwords 21 | 22 | __all__ = [Stopwords, Revision, Diff] 23 | -------------------------------------------------------------------------------- /tests/features/wikibase/tests/test_util.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.features.wikibase.util import diff_dicts 3 | 4 | 5 | def test_diff_dicts(): 6 | 7 | diff = diff_dicts(None, {'a': 1, 'b': 2}) 8 | assert diff.added == {'a', 'b'} 9 | assert diff.removed == set() 10 | assert diff.intersection == set() 11 | assert diff.changed == set() 12 | assert diff.unchanged == set() 13 | 14 | diff = diff_dicts({'a': 1, 'b': 2, 'c': 3}, {'a': 1, 'b': 3, 'd': 10}) 15 | assert diff.added == {'d'} 16 | assert diff.removed == {'c'} 17 | assert diff.intersection == {'a', 'b'} 18 | assert diff.changed == {'b'} 19 | assert diff.unchanged == {'a'} 20 | -------------------------------------------------------------------------------- /revscoring/features/wikibase/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This features module provides access to features of the bytes of content in 3 | revisions. 4 | 5 | .. autodata:: revscoring.features.wikibase.revision 6 | 7 | Supporting classes 8 | ++++++++++++++++++ 9 | 10 | .. autoclass:: revscoring.features.wikibase.Revision 11 | :members: 12 | :member-order: bysource 13 | 14 | .. 
autoclass:: revscoring.features.wikibase.Diff 15 | :members: 16 | :member-order: bysource 17 | 18 | """ 19 | from .features import Diff, Revision 20 | from .revision_oriented import revision 21 | from .util import DictDiff, diff_dicts 22 | 23 | __all__ = [diff_dicts, DictDiff, revision, Revision, Diff] 24 | -------------------------------------------------------------------------------- /revscoring/dependencies/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides a general set of utilities for implementing a set of 3 | dependencies, solving them and injecting context and cache. 4 | 5 | .. automodule:: revscoring.dependencies.dependent 6 | 7 | functions 8 | +++++++++ 9 | .. automodule:: revscoring.dependencies.functions 10 | 11 | context 12 | +++++++ 13 | .. automodule:: revscoring.dependencies.context 14 | """ 15 | 16 | from .context import Context 17 | from .dependent import Dependent, DependentSet 18 | from .functions import dig, draw, expand, normalize_context, solve 19 | 20 | __all__ = [solve, expand, dig, draw, normalize_context, Context, Dependent, 21 | DependentSet] 22 | -------------------------------------------------------------------------------- /tests/utilities/test_union_intersect_observations.py: -------------------------------------------------------------------------------- 1 | from revscoring.utilities.intersect_merge_observations import \ 2 | intersect_merge_observations 3 | 4 | 5 | def test_intersect_merge(): 6 | """Merge and inspect results. 
7 | """ 8 | a = [ 9 | {"rev_id": 101, "goodfaith": False, "damaging": True}, 10 | {"rev_id": 102, "goodfaith": False, "damaging": False}, 11 | ] 12 | b = [ 13 | {"rev_id": 101, "goodfaith": True, "damaging": True} 14 | ] 15 | expected = [ 16 | {"rev_id": 101, "goodfaith": True, "damaging": True} 17 | ] 18 | result = intersect_merge_observations([a, b], "rev_id") 19 | assert expected == list(result) 20 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Statistics represent the fitness of a :class:`revscoring.Model`. They can 3 | be :func:`~revscoring.scoring.Statistics.fit` to scores and labels and 4 | then output using :func:`~revscoring.scoring.Statistics.format`. Once 5 | initialize, a :class:`~revscoring.scoring.Statistics` instance behaves like 6 | a `dict` of statistics values. 7 | 8 | Classification 9 | ++++++++++++++ 10 | .. automodule:: revscoring.scoring.statistics.classification 11 | 12 | Abstract base class 13 | +++++++++++++++++++ 14 | .. 
automodule:: revscoring.scoring.statistics.statistics 15 | 16 | """ 17 | from .classification import Classification 18 | 19 | __all__ = [Classification] 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython >= 0.28.5 2 | deep_merge >= 0.0.1, < 0.0.999 3 | deltas >= 0.7.0, < 0.7.999 4 | docopt >= 0.6.2, < 0.6.999 5 | gensim >= 3.8.1 6 | hanziconv >= 0.3.2, < 0.3.999 7 | mmh3 >= 2.5.1, < 2.5.999 8 | more-itertools >= 7.2.0, < 7.2.999 9 | mwapi >= 0.5.0, < 0.6.999 10 | mwbase >= 0.1.4, < 0.1.999 11 | mwtypes >= 0.2.0, < 0.3.999 12 | mwparserfromhell >= 0.6.5, < 0.6.999 13 | mysqltsv >= 0.0.7, < 0.0.999 14 | nltk >= 3.6.6 15 | numpy >= 1.21.5, < 1.25.0 16 | pytz >= 2017.2 17 | requests >= 2.0.0, < 2.999.999 18 | pyenchant >= 3.2.2 19 | scipy >= 1.5.4, < 1.10.1 20 | scikit-learn >= 1.0.2 21 | tabulate >= 0.9.0, < 0.9.999 22 | tqdm >= 4.15.0, < 4.15.9999 23 | yamlconf-wmf==0.2.10 24 | flashtext==2.7 25 | -------------------------------------------------------------------------------- /tests/scoring/test_environment.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from pytest import raises 4 | 5 | from revscoring.about import __version__ 6 | from revscoring.scoring.environment import Environment 7 | 8 | 9 | def test_environment(): 10 | env = Environment() 11 | 12 | print(env.format(formatting="str")) 13 | print(json.dumps(env.format(formatting="json"), indent=2)) 14 | assert env.lookup(['revscoring_version']) == __version__ 15 | env.check(raise_exception=True) 16 | 17 | 18 | def test_env_error(): 19 | with raises(RuntimeError): 20 | env = Environment() 21 | env['revscoring_version'] = "foo" 22 | print(json.dumps(env.format(formatting="json"), indent=2)) 23 | env.check(raise_exception=True) 24 | 
-------------------------------------------------------------------------------- /docs/notes/2015-10-07.language_structure.txt: -------------------------------------------------------------------------------- 1 | - revscoring 2 | - features 3 | - revision 4 | - words 5 | - languages 6 | - english (Language, RegexBadwords, RegexInformals, Dictionary) 7 | - revision 8 | - badwords 9 | - proportion_of_badwords = english.revision.badwords / max(revision.words, 1) 10 | - informals 11 | - proportion_of_informals = english.revision.informals / max(revision.words, 1) 12 | - dict_words 13 | - proportion_of_dictwords 14 | - mispellings = revision.words - english.revision.dict_words 15 | - proportion_of_misspellings = english.revision.misspellings / max(revision.words, 1) 16 | - parent_revision 17 | - ... 18 | - french 19 | - ... 20 | 21 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_counts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | from revscoring.scoring.statistics.classification.counts import Counts 5 | 6 | 7 | def test_counts(): 8 | c = Counts( 9 | [True, False], 10 | [({'prediction': True}, True)] * 10 + 11 | [({'prediction': True}, False)] * 20 + 12 | [({'prediction': False}, False)] * 30 + 13 | [({'prediction': False}, True)] * 40, 14 | 'prediction' 15 | ) 16 | 17 | print(c.format_str({})) 18 | print(json.dumps(c.format_json({}), indent=2)) 19 | assert c.lookup("n") == 100 20 | assert c.lookup("labels.true") == 50 21 | assert c.lookup("predictions.false.false") == 30 22 | 23 | pickle.loads(pickle.dumps(c)) 24 | -------------------------------------------------------------------------------- /revscoring/languages/features/dictionary/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of dictionary lookup. 
3 | 4 | .. autoclass:: revscoring.languages.features.Dictionary 5 | :members: 6 | :member-order: bysource 7 | 8 | Supporting classes 9 | ------------------ 10 | 11 | .. autoclass:: revscoring.languages.features.dictionary.Revision 12 | :members: 13 | :member-order: bysource 14 | 15 | .. autoclass:: revscoring.languages.features.dictionary.Diff 16 | :members: 17 | :member-order: bysource 18 | """ 19 | from .dictionary import Dictionary 20 | from .features import Diff, Revision 21 | from .util import utf16_cleanup, load_dict, MultiDictChecker 22 | 23 | __all__ = [Dictionary, utf16_cleanup, load_dict, MultiDictChecker, Revision, Diff] 24 | -------------------------------------------------------------------------------- /revscoring/scoring/models/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of linear classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.LogisticRegression 5 | :members: 6 | :member-order: 7 | """ 8 | import logging 9 | 10 | from sklearn.linear_model import LogisticRegression as sklearn_LR 11 | 12 | from .sklearn import ProbabilityClassifier 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class LogisticRegression(ProbabilityClassifier): 18 | """ 19 | Implements a Logistic Regression 20 | """ 21 | Estimator = sklearn_LR 22 | 23 | def __init__(self, *args, label_weights=None, **kwargs): 24 | if label_weights: 25 | logger.warn("LogisticRegression does not support label_weights.") 26 | super().__init__(*args, **kwargs) 27 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_hashing.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.datasources.meta import hashing 5 | from revscoring.dependencies import solve 6 | 7 | my_tokens = Datasource("my_tokens") 8 | 
my_hashes = hashing.hash(my_tokens, n=10) 9 | 10 | 11 | def test_hashing(): 12 | hashes = solve( 13 | my_hashes, cache={my_tokens: [("one", "two"), "two", "three", "four"]}) 14 | 15 | assert len(hashes) == 4 16 | assert max(hashes) <= 10, str(max(hashes)) 17 | 18 | hashes_again = solve( 19 | my_hashes, cache={my_tokens: [("one", "two"), "two", "three", "four"]}) 20 | 21 | assert hashes == hashes_again 22 | 23 | assert (pickle.loads(pickle.dumps(my_hashes)) == 24 | my_hashes) 25 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_threshold_optimization.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.scoring.statistics.classification.threshold_optimization import \ 3 | ThresholdOptimization 4 | 5 | 6 | def test_threshold_optimization(): 7 | ThresholdOptimization.parse("maximum precision @ recall >= 0.9") 8 | to = ThresholdOptimization.parse("maximum precision @ !recall >= 0.9") 9 | assert to.maximize is True 10 | assert to.target_stat == "precision" 11 | assert to.cond_stat == "!recall" 12 | assert to.greater is True 13 | assert to.cond_value == 0.9 14 | 15 | to = ThresholdOptimization.parse("minimum waffle_monster @ peet <= 0.001") 16 | assert to.maximize is False 17 | assert to.target_stat == "waffle_monster" 18 | assert to.cond_stat == "peet" 19 | assert to.greater is False 20 | assert to.cond_value == 0.001 21 | -------------------------------------------------------------------------------- /revscoring/languages/features/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Dictionary 3 | ++++++++++ 4 | .. automodule :: revscoring.languages.features.dictionary 5 | 6 | RegexMatches 7 | ++++++++++++ 8 | .. automodule :: revscoring.languages.features.matches.regex_matches 9 | 10 | Stopwords 11 | +++++++++ 12 | .. 
automodule :: revscoring.languages.features.stopwords 13 | 14 | Stemmed 15 | +++++++ 16 | .. automodule :: revscoring.languages.features.stemmed 17 | 18 | SubstringMatches 19 | ++++++++++++++++ 20 | .. automodule :: revscoring.languages.features.matches.substring_matches 21 | 22 | """ 23 | from .dictionary import Dictionary 24 | from .stemmed import Stemmed 25 | from .stopwords import Stopwords 26 | from .matches.regex_matches import RegexMatches 27 | from .matches.substring_matches import SubstringMatches 28 | 29 | __all__ = [Dictionary, RegexMatches, Stemmed, Stopwords, SubstringMatches] 30 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: "CI" 2 | on: 3 | pull_request: 4 | types: [opened, synchronize, reopened, ready_for_review] 5 | 6 | jobs: 7 | ci-merge: 8 | runs-on: ubuntu-latest 9 | container: 10 | image: docker-registry.wikimedia.org/bullseye:20221218 11 | options: --user root 12 | 13 | steps: 14 | - name: Check out repository code 15 | uses: actions/checkout@v3 16 | 17 | - name: Setup Wikimedia Bullseye image 18 | run: | 19 | apt-get update -y 20 | apt-get install python3-pip python3-dev python3-setuptools build-essential python3-enchant g++ gfortran git \ 21 | liblapack-dev libopenblas-dev libenchant-2-2 wget -y 22 | 23 | - name: Run all 24 | run: | 25 | make pip-install 26 | make setup-image 27 | make run-tests 28 | 29 | outputs: 30 | head-status: ${{ job.status }} 31 | -------------------------------------------------------------------------------- /tests/scoring/test_model_info.py: -------------------------------------------------------------------------------- 1 | from pytest import raises 2 | 3 | from revscoring.errors import ModelInfoLookupError 4 | from revscoring.scoring.model_info import ModelInfo 5 | 6 | 7 | def test_model_info(): 8 | mi = ModelInfo(default_fields={'bar', 'foo'}) 9 | mi['foo'] = 1 
10 | mi['bar'] = 2 11 | mi['baz'] = 3 12 | mi[True] = 1 13 | 14 | assert 'bar' in mi.format([''], formatting="json") 15 | assert 'baz' not in mi.format([''], formatting="json") 16 | mi.format(['true'], formatting="json") 17 | 18 | assert list(mi.keys()) == ['foo', 'bar', 'baz', True] 19 | assert list(mi.format_json({}).keys()) == ['foo', 'bar'] 20 | 21 | 22 | def test_model_info_error(): 23 | with raises(ModelInfoLookupError) as e: 24 | mi = ModelInfo() 25 | mi['baz'] = 3 26 | mi[True] = 1 27 | 28 | mi.format(['false']) 29 | assert e.value.args[0] == 'false' 30 | -------------------------------------------------------------------------------- /tests/features/bytes/tests/test_revision_oriented.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources import revision_oriented 4 | from revscoring.dependencies import solve 5 | from revscoring.features.bytes.revision_oriented import revision 6 | 7 | length_change = revision.parent.length - revision.length 8 | 9 | 10 | def test_length(): 11 | cache = {revision_oriented.revision.parent.text: "I am ascii", 12 | revision_oriented.revision.text: "地を南北に縦走する"} 13 | 14 | assert solve(revision.length, cache=cache) == 27 15 | assert solve(revision.parent.length, cache=cache) == 10 16 | assert solve(length_change, cache=cache) == -17 17 | 18 | assert pickle.loads(pickle.dumps(revision.length)) == revision.length 19 | assert (pickle.loads(pickle.dumps(revision.parent.length)) == 20 | revision.parent.length) 21 | assert pickle.loads(pickle.dumps(length_change)) == length_change 22 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_model.py: -------------------------------------------------------------------------------- 1 | 2 | from pytest import mark 3 | 4 | from revscoring.features import Feature 5 | from revscoring.scoring.models.model import Classifier, Learned, Model 6 | 7 | 8 | def 
test_model(): 9 | m = Model([Feature("foo", returns=int)], version="0.0.1") 10 | 11 | assert m.info.lookup('version') == "0.0.1" 12 | 13 | 14 | def test_from_config(): 15 | config = { 16 | 'scorer_models': { 17 | 'test': { 18 | 'module': "pytest.mark" 19 | } 20 | } 21 | } 22 | model = Model.from_config(config, 'test') 23 | assert model == mark 24 | 25 | 26 | def test_learned_model(): 27 | model = Learned([Feature("foo", returns=int)]) 28 | assert model.trained is None 29 | 30 | 31 | def test_classifier(): 32 | model = Classifier([Feature("foo", returns=int)], [True, False]) 33 | assert 'statustics' not in model.info 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Dockerfile for building a Docker container. See https://www.docker.com/ 2 | # Install wikimedia runscoring, with dependencies 3 | # See: https://github.com/wikimedia/revscoring 4 | 5 | # Build via docker build --rm -t nealmcb/revscoring:0.3 . 
6 | 7 | FROM jupyter/notebook 8 | 9 | RUN DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get upgrade -y 10 | 11 | RUN DEBIAN_FRONTEND=noninteractive apt-get install -y \ 12 | python3-dev \ 13 | python3-numpy \ 14 | python3-scipy \ 15 | g++ \ 16 | gfortran \ 17 | liblapack-dev \ 18 | libopenblas-dev \ 19 | myspell-pt \ 20 | myspell-fa \ 21 | myspell-en-au \ 22 | myspell-en-gb \ 23 | myspell-en-us \ 24 | myspell-en-za \ 25 | myspell-fr \ 26 | myspell-es \ 27 | myspell-he \ 28 | hunspell-vi \ 29 | aspell-id 30 | 31 | RUN pip3 install --user revscoring 32 | 33 | RUN python3 -m nltk.downloader stopwords 34 | -------------------------------------------------------------------------------- /revscoring/languages/features/stemmed/stemmed.py: -------------------------------------------------------------------------------- 1 | from ....dependencies import DependentSet 2 | from ....features import wikitext 3 | from . import datasources, features 4 | 5 | 6 | class Stemmed(DependentSet): 7 | """ 8 | :Parameters: 9 | name : `str` 10 | A name for the collection 11 | stem_word : `func` 12 | A function that, give a word, will return a stemmed version of that 13 | word 14 | """ 15 | 16 | def __init__(self, name, stem_word): 17 | super().__init__(name) 18 | self.revision = features.Revision( 19 | name + ".revision", 20 | datasources.Revision(name + ".revision", stem_word, 21 | wikitext.revision.datasources) 22 | ) 23 | """ 24 | :class:`~revscoring.languages.features.stemmed.Revision` : 25 | The base revision feature set. 
26 | """ 27 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_dicts.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources.datasource import Datasource 4 | from revscoring.datasources.meta import dicts 5 | from revscoring.dependencies import solve 6 | 7 | my_dict = Datasource("my_dict") 8 | 9 | my_keys = dicts.keys(my_dict) 10 | my_values = dicts.values(my_dict) 11 | 12 | 13 | def test_dict_keys(): 14 | cache = {my_dict: {"foo": 1, "bar": 2}} 15 | assert set(solve(my_keys, cache=cache)) == {"foo", "bar"} 16 | cache = {my_dict: None} 17 | assert set(solve(my_keys, cache=cache)) == set() 18 | 19 | assert pickle.loads(pickle.dumps(my_keys)) == my_keys 20 | 21 | 22 | def test_dict_values(): 23 | cache = {my_dict: {"foo": 1, "bar": 2}} 24 | assert set(solve(my_values, cache=cache)) == {1, 2} 25 | cache = {my_dict: None} 26 | assert set(solve(my_values, cache=cache)) == set() 27 | 28 | assert pickle.loads(pickle.dumps(my_values)) == my_values 29 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/timestamp.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that 3 | return `mwtypes.Timestamp` of the given string. 4 | 5 | .. autoclass:: revscoring.datasources.meta.timestamp.Timestamp 6 | """ 7 | import mwtypes 8 | 9 | from ..datasource import Datasource 10 | 11 | MW_REGISTRATION_EPOCH = '2006-01-01T00:00:00Z' 12 | 13 | 14 | class Timestamp(Datasource): 15 | """ 16 | Generates a mwtypes.Timestamp of the given string 17 | 18 | :Parameters: 19 | timestamp_str : `str` 20 | Timestamp string in ISO format. 21 | name : `str` 22 | A name for the datasource. 
23 | """ 24 | 25 | def __init__(self, timestamp_str, name=None): 26 | super().__init__(name, self.process, 27 | depends_on=[timestamp_str]) 28 | 29 | def process(self, timestamp_str): 30 | return mwtypes.Timestamp(timestamp_str or MW_REGISTRATION_EPOCH) 31 | -------------------------------------------------------------------------------- /revscoring/utilities/check_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``revscoring check_model -h`` 3 | :: 4 | 5 | Compares a models construction environment snapshot to the current 6 | environment. 7 | 8 | Usage: 9 | check_model -h | --help 10 | check_model [--raise-exception] 11 | 12 | Options: 13 | -h --help Prints this documentation 14 | Path to a model file 15 | --raise-exception Causes an error return state if there are 16 | inconsistencies between the current environment 17 | and the model's build environment. 18 | """ 19 | import docopt 20 | 21 | from ..scoring import Model, models 22 | 23 | 24 | def main(argv=None): 25 | args = docopt.docopt(__doc__, argv=argv) 26 | raise_exception = args['--raise-exception'] 27 | Model.load(models.open_file(args['']), 28 | error_on_env_check=raise_exception) 29 | -------------------------------------------------------------------------------- /revscoring/languages/features/dictionary/dictionary.py: -------------------------------------------------------------------------------- 1 | from ....dependencies import DependentSet 2 | from ....features import wikitext 3 | from . import datasources, features 4 | 5 | 6 | class Dictionary(DependentSet): 7 | """ 8 | :Parameters: 9 | name : `str` 10 | A name for the collection 11 | dictionary_check : `func` 12 | A function that, given a word, performs a dictionary check and 13 | returns True if the word exists. 
14 | """ 15 | 16 | def __init__(self, name, dictionary_check): 17 | super().__init__(name) 18 | self.revision = features.Revision( 19 | name + ".revision", 20 | datasources.Revision(name + ".revision", dictionary_check, 21 | wikitext.revision.datasources) 22 | ) 23 | """ 24 | :class:`~revscoring.languages.features.dictionary.Revision` : 25 | The base revision feature set. 26 | """ 27 | -------------------------------------------------------------------------------- /revscoring/datasources/datasource.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autoclass:: revscoring.Datasource 3 | :member-order: 4 | :inherited-members: 5 | """ 6 | from ..dependencies import Dependent 7 | 8 | 9 | class Datasource(Dependent): 10 | """ 11 | Represents a data source for generating features. Unlike features, 12 | datasources do not necessarily generate simple scalar values. 13 | 14 | :Parameters: 15 | name : str 16 | The name of the feature 17 | process : `func` 18 | A function that will generate a data value 19 | depends_on : `list`(`hashable`) 20 | An ordered list of dependencies that correspond 21 | to the `*args` of `process` 22 | """ 23 | 24 | def __init__(self, *args, **kwargs): 25 | super().__init__(*args, **kwargs) 26 | 27 | def __hash__(self): 28 | return hash('datasource.' + self.name) 29 | 30 | def __str__(self): 31 | return "datasource." 
+ self.name 32 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_multilabel_counts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pickle 3 | 4 | from revscoring.scoring.statistics.classification.counts import \ 5 | MultilabelCounts 6 | 7 | 8 | def test_counts(): 9 | c = MultilabelCounts( 10 | ["foo", "bar", "baz"], 11 | [({'prediction': ["foo"]}, ["foo", "bar"])] * 10 + 12 | [({'prediction': ["foo", "bar", "baz"]}, ["foo", "baz"])] * 20 + 13 | [({'prediction': ["bar"]}, ["bar"])] * 30 + 14 | [({'prediction': ["baz"]}, ["bar", "baz"])] * 40, 15 | 'prediction' 16 | ) 17 | 18 | print(c.format_str({})) 19 | print(json.dumps(c.format_json({}), indent=2)) 20 | assert c.lookup("n") == 100 21 | assert c.lookup("labels.foo") == 30 22 | assert c.lookup("predictions.foo.true.false") == 0 23 | assert c.lookup("predictions.foo.true.true") == 30 24 | assert c.lookup("predictions.bar.false.true") == 20 25 | 26 | pickle.loads(pickle.dumps(c)) 27 | -------------------------------------------------------------------------------- /revscoring/languages/features/matches/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of a set of regexes applied to strings. 3 | 4 | .. autoclass:: revscoring.languages.features.RegexMatches 5 | :members: 6 | :member-order: bysource 7 | 8 | Implements a feature set based off of a set of substrings applied to strings. 9 | 10 | .. autoclass:: revscoring.languages.features.SubstringMatches 11 | :members: 12 | :member-order: bysource 13 | 14 | Supporting classes 15 | ------------------ 16 | 17 | .. autoclass:: revscoring.languages.features.matches.Revision 18 | :members: 19 | :member-order: bysource 20 | 21 | .. 
autoclass:: revscoring.languages.features.matches.Diff 22 | :members: 23 | :member-order: bysource 24 | """ 25 | from .features import Diff, Revision 26 | from .matches import Matches 27 | from .substring_matches import SubstringMatches 28 | from .regex_matches import RegexMatches 29 | 30 | __all__ = [Matches, RegexMatches, SubstringMatches, Revision, Diff] 31 | -------------------------------------------------------------------------------- /tests/features/test_modifiers.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from math import log as math_log 3 | 4 | from revscoring.dependencies import solve 5 | from revscoring.features import modifiers 6 | 7 | 8 | def test_log(): 9 | log_five = modifiers.log(5) 10 | 11 | assert solve(log_five) == math_log(5) 12 | 13 | assert solve(pickle.loads(pickle.dumps(log_five))) == math_log(5) 14 | 15 | assert repr(log_five) == "" 16 | 17 | 18 | def test_max(): 19 | 20 | max_five_six_seven = modifiers.max(5, 6, 7) 21 | 22 | assert solve(max_five_six_seven) == 7 23 | 24 | assert solve(pickle.loads(pickle.dumps(max_five_six_seven))) == 7 25 | 26 | assert repr(max_five_six_seven) == "" 27 | 28 | 29 | def test_min(): 30 | 31 | min_five_six_seven = modifiers.min(5, 6, 7) 32 | 33 | assert solve(min_five_six_seven) == 5 34 | 35 | assert pickle.loads(pickle.dumps(min_five_six_seven)) == min_five_six_seven 36 | 37 | assert repr(min_five_six_seven) == "" 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *~ 5 | ipython/.ipynb_checkpoints 6 | .python-version 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Demo files 12 | demo_*.py 13 | 14 | # Datasets & Models 15 | datasets/ 16 | models/*.model 17 | 18 | # Distribution / packaging 19 | .Python 20 | env/ 21 | bin/ 22 | build/ 23 | 
docs/_build/ 24 | develop-eggs/ 25 | dist/ 26 | eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | _build/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | 49 | # Translations 50 | *.mo 51 | 52 | # Mr Developer 53 | .mr.developer.cfg 54 | .project 55 | .pydevproject 56 | 57 | # Rope 58 | .ropeproject 59 | 60 | # Django stuff: 61 | *.log 62 | *.pot 63 | 64 | # Sphinx documentation 65 | doc/_build/ 66 | 67 | # pyCharm, IntelliJ 68 | *.idea 69 | *.iml 70 | -------------------------------------------------------------------------------- /docs/api_reference.rst: -------------------------------------------------------------------------------- 1 | .. _api-reference: 2 | 3 | :orphan: 4 | 5 | revscoring package 6 | ================== 7 | 8 | Subpackages 9 | ----------- 10 | 11 | .. toctree:: 12 | :maxdepth: 2 13 | 14 | revscoring.dependencies 15 | revscoring.datasources 16 | revscoring.datasources.meta 17 | revscoring.datasources.revision_oriented 18 | revscoring.extractors 19 | revscoring.features 20 | revscoring.features.meta 21 | revscoring.features.modifiers 22 | revscoring.features.revision_oriented 23 | revscoring.features.bytes 24 | revscoring.features.temporal 25 | revscoring.features.wikibase 26 | revscoring.features.wikitext 27 | revscoring.languages 28 | revscoring.languages.features 29 | revscoring.scoring 30 | revscoring.scoring.models 31 | revscoring.scoring.statistics 32 | revscoring.utilities 33 | 34 | Submodules 35 | ---------- 36 | 37 | revscoring.errors module 38 | ------------------------ 39 | 40 | .. 
automodule:: revscoring.errors 41 | :members: 42 | :undoc-members: 43 | -------------------------------------------------------------------------------- /tests/features/test_functions.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.datasources import Datasource 3 | from revscoring.features.feature import Constant, Feature 4 | from revscoring.features.feature_vector import FeatureVector 5 | from revscoring.features.functions import trim, vectorize_values 6 | from revscoring.features.modifiers import log, max 7 | 8 | 9 | def test_trim(): 10 | 11 | d1 = Datasource("derp1") 12 | f1 = Feature("foobar1", returns=int) 13 | f2 = Feature("foobar2", returns=int, depends_on=[d1]) 14 | c = Constant(value=5) 15 | fv = FeatureVector("foobar3", returns=int, depends_on=[c]) 16 | 17 | assert list(trim(f1)) == [f1] 18 | assert list(trim([f1, f2, fv])) == [f1, f2, fv] 19 | assert list(trim([f1, f2, f1 + f2, fv])) == [f1, f2, fv] 20 | assert (list(trim(log(max(f1 - f2, 1)))) == 21 | [f1, f2]) 22 | 23 | 24 | def test_vectorize_features(): 25 | 26 | feature_values = [1, 2.0, [1.0, 2.0, 3.0], False] 27 | assert (vectorize_values(feature_values) == 28 | [1, 2.0, 1.0, 2.0, 3.0, False]) 29 | -------------------------------------------------------------------------------- /revscoring/languages/features/matches/substring_matches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of list of regular expressions to match. 3 | 4 | .. autoclass:: revscoring.languages.features.SubstringMatches 5 | :members: 6 | :member-order: bysource 7 | """ 8 | from . import Matches 9 | from ....datasources.meta import extractors 10 | 11 | 12 | class SubstringMatches(Matches): 13 | """ 14 | :Parameters: 15 | name : `str` 16 | A name for the collection 17 | substrings : `list` ( `str` ) 18 | A list of substrings to match. 
19 | exclusions : `list` ( `str` ) 20 | A list of substrings to explicitly not match 21 | """ 22 | 23 | def __init__(self, name, substrings, exclusions=None, 24 | text_preprocess=None): 25 | matcher = extractors.trie(substrings, 26 | exclusions=exclusions).process 27 | super().__init__(name, matcher, substrings, exclusions, 28 | text_preprocess=text_preprocess) 29 | -------------------------------------------------------------------------------- /tests/scoring/test_util.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.scoring import util 3 | 4 | 5 | def test_pattern(): 6 | assert (util.parse_pattern("'maximum filter_rate @ recall >= 0.9'.labels.true") == # noqa 7 | ["maximum filter_rate @ recall >= 0.9", "labels", "true"]) 8 | assert (util.parse_pattern("'maximum filter_rate @ recall >= 0.9'.'labels'.true") == # noqa 9 | ["maximum filter_rate @ recall >= 0.9", "labels", "true"]) 10 | assert (util.parse_pattern("'foo\"bar\"'.buz") == # noqa 11 | ["foo\"bar\"", "buz"]) 12 | 13 | 14 | def test_treeify(): 15 | paths = (util.parse_pattern(p) 16 | for p in ['foo.bar.baz', 'foo.bar.buz', 'foo.bar', 'bum']) 17 | assert (util.treeify(paths) == 18 | {'foo': {'bar': {'baz': {}, 'buz': {}}}, 'bum': {}}) 19 | assert util.treeify([util.parse_pattern("")]) == {} 20 | 21 | 22 | def test_dict_lookup(): 23 | r = util.dict_lookup({'foo': {'bar': {'baz': 1}}, 'bum': {'derp': 2}}, 24 | {'foo': {'bar': {}}}) 25 | assert r == {'foo': {'bar': {'baz': 1}}} 26 | -------------------------------------------------------------------------------- /revscoring/scoring/models/svc.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Support Vector Machine type classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.LinearSVC 5 | :members: 6 | :member-order: 7 | 8 | .. autoclass:: revscoring.scoring.models.RBFSVC 9 | :members: 10 | :member-order: 11 | 12 | .. 
autoclass:: revscoring.scoring.models.SVC 13 | :members: 14 | :member-order: 15 | 16 | """ 17 | from sklearn import svm 18 | 19 | from .sklearn import ProbabilityClassifier 20 | 21 | 22 | class SVC(ProbabilityClassifier): 23 | """ 24 | Implements a Support Vector Classifier model. 25 | """ 26 | Estimator = svm.SVC 27 | BASE_PARAMS = {'probability': True} 28 | 29 | 30 | class LinearSVC(SVC): 31 | """ 32 | Implements a Support Vector Classifier model with a Linear kernel. 33 | """ 34 | BASE_PARAMS = {'probability': True, 'kernel': "linear"} 35 | 36 | 37 | class RBFSVC(SVC): 38 | """ 39 | Implements a Support Vector Classifier model with an RBF kernel. 40 | """ 41 | BASE_PARAMS = {'probability': True, 'kernel': "rbf"} 42 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Meta-Features are classes that extend :class:`~revscoring.Datasource` and 3 | implement common operations on other :class:`~revscoring.Datasource`. 4 | 5 | dicts 6 | +++++ 7 | .. automodule:: revscoring.datasources.meta.dicts 8 | 9 | extractors 10 | ++++++++++ 11 | .. automodule:: revscoring.datasources.meta.extractors 12 | 13 | filters 14 | +++++++ 15 | .. automodule:: revscoring.datasources.meta.filters 16 | 17 | frequencies 18 | +++++++++++ 19 | .. automodule:: revscoring.datasources.meta.frequencies 20 | 21 | gramming 22 | ++++++++ 23 | .. automodule:: revscoring.datasources.meta.gramming 24 | 25 | hashing 26 | +++++++ 27 | .. automodule:: revscoring.datasources.meta.hashing 28 | 29 | indexable 30 | +++++++++ 31 | .. automodule:: revscoring.datasources.meta.indexable 32 | 33 | mappers 34 | +++++++ 35 | .. automodule:: revscoring.datasources.meta.mappers 36 | 37 | selectors 38 | +++++++++ 39 | .. automodule:: revscoring.datasources.meta.selectors 40 | 41 | timestamp 42 | +++++++++ 43 | .. 
automodule:: revscoring.datasources.meta.timestamp 44 | """ 45 | -------------------------------------------------------------------------------- /revscoring/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Scoring is what the `revscoring` library was designed to do. The basics of 3 | scoring are :class:`revscoring.Model` that implement 4 | :func:`~revscoring.Model.score` and :class:`revscoring.scoring.Statistics` that 5 | are :func:`~revscoring.scoring.Statistics.fit` using the scores generated by a 6 | :class:`revscoring.Model`. Prediction models are fragile, so models keep track 7 | of their :class:`revscoring.scoring.Environment` and you can 8 | :func:`revscoring.scoring.Environment.check` them against the current 9 | environment. 10 | 11 | See :mod:`revscoring.scoring.models` and :mod:`revscoring.scoring.statistics` 12 | for more information. 13 | 14 | .. autoclass:: revscoring.Model 15 | :members: 16 | 17 | .. autoclass:: revscoring.scoring.ModelInfo 18 | :members: 19 | 20 | .. autoclass:: revscoring.scoring.Environment 21 | :members: 22 | """ 23 | from .environment import Environment 24 | from .model_info import ModelInfo 25 | from .models.model import Model 26 | from .statistics.statistics import Statistics 27 | 28 | __all__ = [Model, ModelInfo, Statistics, Environment] 29 | -------------------------------------------------------------------------------- /revscoring/scoring/models/naive_bayes.py: -------------------------------------------------------------------------------- 1 | """ 2 | A collection of Naive Bayes type classifier models. 3 | 4 | .. autoclass:: revscoring.scoring.models.GaussianNB 5 | :members: 6 | :member-order: 7 | 8 | .. autoclass:: revscoring.scoring.models.MultinomialNB 9 | :members: 10 | :member-order: 11 | 12 | .. 
autoclass:: revscoring.scoring.models.BernoulliNB 13 | :members: 14 | :member-order: 15 | """ 16 | import logging 17 | 18 | from sklearn import naive_bayes 19 | 20 | from .sklearn import ProbabilityClassifier 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | class NaiveBayes(ProbabilityClassifier): 26 | pass 27 | 28 | 29 | class GaussianNB(NaiveBayes): 30 | """ 31 | Implements a Gaussian Naive Bayes model 32 | """ 33 | Estimator = naive_bayes.GaussianNB 34 | 35 | 36 | class MultinomialNB(NaiveBayes): 37 | """ 38 | Implements a Multinomial Naive Bayes model 39 | """ 40 | Estimator = naive_bayes.MultinomialNB 41 | 42 | 43 | class BernoulliNB(NaiveBayes): 44 | """ 45 | Implements a Bernoulli Naive Bayes model 46 | """ 47 | Estimator = naive_bayes.BernoulliNB 48 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | pip-install: requirements.txt test-requirements.txt 2 | pip install -r requirements.txt 3 | pip install -r test-requirements.txt 4 | 5 | .PHONY: run-tests 6 | run-tests: 7 | python3 -m pytest tests/ -v --cov 8 | 9 | .PHONY: setup-image 10 | setup-image: 11 | apt-get install \ 12 | hunspell-ar \ 13 | aspell-bn \ 14 | aspell-el \ 15 | hunspell-id \ 16 | hunspell-en-us \ 17 | aspell-is \ 18 | aspell-pl \ 19 | aspell-ro \ 20 | aspell-sv \ 21 | aspell-ta \ 22 | aspell-uk \ 23 | hunspell-cs \ 24 | hunspell-de-at \ 25 | hunspell-de-ch \ 26 | hunspell-de-de \ 27 | hunspell-es \ 28 | hunspell-et \ 29 | myspell-fa \ 30 | hunspell-fr \ 31 | hunspell-he \ 32 | hunspell-hr \ 33 | aspell-hu \ 34 | hunspell-lv \ 35 | myspell-nb \ 36 | hunspell-nl \ 37 | hunspell-pt-pt \ 38 | hunspell-pt-br \ 39 | hunspell-ru \ 40 | hunspell-hr \ 41 | hunspell-bs \ 42 | hunspell-ca \ 43 | hunspell-en-au \ 44 | hunspell-en-us \ 45 | hunspell-en-gb \ 46 | hunspell-eu \ 47 | hunspell-gl \ 48 | hunspell-it \ 49 | hunspell-hi \ 50 | hunspell-sr \ 51 | 
hunspell-vi \ 52 | -y 53 | python3 -m nltk.downloader omw sentiwordnet stopwords wordnet 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Aaron Halfaker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /revscoring/datasources/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a set of 3 | :class:`~revscoring.Datasource` 4 | processors that represent the input data for extracting 5 | :class:`~revscoring.Feature` values. 
Just like
:class:`~revscoring.Feature` and other
:class:`~revscoring.Dependent` processors,
:class:`~revscoring.Datasource` processors are intended to
be :func:`~revscoring.dependencies.solve`'d as dependencies.
27 | """ 28 | 29 | 30 | class WordIsInStopwordSet: 31 | 32 | def __init__(self, stopword_set, cleanup=None): 33 | self.stopword_set = stopword_set 34 | 35 | def __call__(self, word): 36 | return word.lower() in self.stopword_set 37 | -------------------------------------------------------------------------------- /revscoring/features/temporal/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This features module provides access to features of the bytes of content in 3 | revisions. 4 | 5 | .. autodata:: revscoring.features.temporal.revision 6 | 7 | Supporting classes 8 | ++++++++++++++++++ 9 | 10 | .. autoclass:: revscoring.features.temporal.Revision 11 | :members: 12 | :member-order: bysource 13 | 14 | .. autoclass:: revscoring.features.temporal.ParentRevision 15 | :members: 16 | :member-order: bysource 17 | 18 | .. autoclass:: revscoring.features.temporal.LastUserRevision 19 | :members: 20 | :member-order: bysource 21 | 22 | .. autoclass:: revscoring.features.temporal.PageCreation 23 | :members: 24 | :member-order: bysource 25 | 26 | .. autoclass:: revscoring.features.temporal.Page 27 | :members: 28 | :member-order: bysource 29 | 30 | .. autoclass:: revscoring.features.temporal.User 31 | :members: 32 | :member-order: bysource 33 | 34 | """ 35 | from .revision_oriented import (LastUserRevision, Page, PageCreation, 36 | ParentRevision, Revision, User, revision) 37 | 38 | __all__ = [revision, Revision, ParentRevision, LastUserRevision, PageCreation, 39 | Page, User] 40 | -------------------------------------------------------------------------------- /revscoring/languages/features/matches/regex_matches.py: -------------------------------------------------------------------------------- 1 | """ 2 | Implements a feature set based off of list of regular expressions to match. 3 | 4 | .. autoclass:: revscoring.languages.features.RegexMatches 5 | :members: 6 | :member-order: bysource 7 | """ 8 | from . 
import Matches 9 | from ....datasources.meta import extractors 10 | 11 | 12 | class RegexMatches(Matches): 13 | """ 14 | :Parameters: 15 | name : `str` 16 | A name for the collection 17 | regexes : `list` ( `str` ) 18 | A list of regex patterns to match. 19 | exclusions : `list` ( `str` ) 20 | A list of terms to explicitly not match 21 | wrapping : `tuple` ( `str`, `str` ) 22 | Insert these characters around matches in the regular expression 23 | """ 24 | 25 | def __init__(self, name, regexes, exclusions=None, 26 | wrapping=(r'\b', r'\b'), text_preprocess=None): 27 | matcher = extractors.regex(regexes, wrapping=wrapping, 28 | exclusions=exclusions).process 29 | super().__init__(name, matcher, regexes, exclusions, 30 | text_preprocess) 31 | -------------------------------------------------------------------------------- /tests/features/meta/tests/test_vectorizers.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.datasources import Datasource 4 | from revscoring.dependencies import solve 5 | from revscoring.features.meta import vectorizers 6 | 7 | my_dict = Datasource("my_dict") 8 | 9 | 10 | class KeysDict(Datasource): 11 | 12 | def __init__(self, name, keys): 13 | super().__init__(name) 14 | self._keys = keys 15 | 16 | def keys(self): 17 | return self._keys 18 | 19 | 20 | my_keys_dict = KeysDict("my_keys_dict", ["a", "b", "c"]) 21 | 22 | 23 | def test_vectorize(): 24 | my_vector = vectorizers.vectorize( 25 | my_dict, ["a", "b", "c"], returns=int) 26 | 27 | assert (solve(my_vector, cache={my_dict: {"a": 5}}) == 28 | [5, 0, 0]) 29 | assert (solve(my_vector, cache={my_dict: {"d": 5}}) == 30 | [0, 0, 0]) 31 | assert (solve(my_vector, cache={my_dict: {"a": 1, "b": 2, "c": 3}}) == 32 | [1, 2, 3]) 33 | 34 | assert pickle.loads(pickle.dumps(my_vector)) == my_vector 35 | 36 | my_keys_vector = vectorizers.vectorize(my_keys_dict, returns=int) 37 | 38 | assert (solve(my_keys_vector, 
cache={my_keys_dict: {"a": 1, "b": 2, "c": 3}}) == 39 | [1, 2, 3]) 40 | -------------------------------------------------------------------------------- /revscoring/features/feature_vector.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autoclass:: revscoring.FeatureVector 3 | :members: 4 | """ 5 | from revscoring.features import Feature 6 | 7 | 8 | class FeatureVector(Feature): 9 | """ 10 | Represents a vector of predictive features. 11 | 12 | :Parameters: 13 | name : str 14 | The name of the feature 15 | process : `func` 16 | A function that will generate a feature value 17 | returns : `type` 18 | A type to compare the return vector of this function to. 19 | dependencies : `list`(`hashable`) 20 | An ordered list of dependencies that correspond 21 | to the `*args` of `process` 22 | """ 23 | 24 | def validate(self, vector): 25 | for i, value in enumerate(vector): 26 | if not isinstance(value, self.returns): 27 | raise ValueError( 28 | "Expected {0}, but got {1} instead at position {2}." 29 | .format(self.returns, type(value), i)) 30 | 31 | return vector 32 | 33 | def __hash__(self): 34 | return hash('feature_vector.' + self.name) 35 | 36 | def __str__(self): 37 | return "feature_vector." + self.name 38 | -------------------------------------------------------------------------------- /revscoring/features/modifiers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Modifiers provide convenient mechanisms for modifying and combining 3 | :class:`revscoring.Feature` and constant values into new 4 | :class:`revscoring.Feature`. 5 | 6 | .. autofunction:: revscoring.features.modifiers.log 7 | 8 | ---- 9 | 10 | .. autofunction:: revscoring.features.modifiers.min 11 | .. autofunction:: revscoring.features.modifiers.max 12 | 13 | ---- 14 | 15 | .. autofunction:: revscoring.features.modifiers.add 16 | .. autofunction:: revscoring.features.modifiers.sub 17 | .. 
autofunction:: revscoring.features.modifiers.mul 18 | .. autofunction:: revscoring.features.modifiers.div 19 | 20 | ---- 21 | 22 | .. autofunction:: revscoring.features.modifiers.eq 23 | .. autofunction:: revscoring.features.modifiers.ne 24 | .. autofunction:: revscoring.features.modifiers.gt 25 | .. autofunction:: revscoring.features.modifiers.lt 26 | .. autofunction:: revscoring.features.modifiers.ge 27 | .. autofunction:: revscoring.features.modifiers.le 28 | 29 | """ 30 | from .feature import (add, and_, div, eq, ge, gt, le, log, lt, max, min, mul, 31 | ne, not_, or_, sub) 32 | 33 | __all__ = [add, div, eq, ge, gt, le, log, lt, max, min, mul, ne, sub, and_, 34 | or_, not_] 35 | -------------------------------------------------------------------------------- /tests/scoring/test_labels.py: -------------------------------------------------------------------------------- 1 | from revscoring.scoring.labels import Binarizer, ClassVerifier 2 | 3 | 4 | def test_class_verifier(): 5 | label_set = [True, False] 6 | cv = ClassVerifier(label_set) 7 | labels = [True, True, False, True, False, False] 8 | cv.check_label_consistency(labels) 9 | normalized_labels = cv.normalize(True) 10 | assert normalized_labels 11 | 12 | 13 | def test_binarizer(): 14 | label_set = ['A', 'B', 'C', 'D'] 15 | labels = [['A', 'B'], ['B', 'D'], ['A', 'B', 'C', 'D'], ['B', 'C']] 16 | binarizer = Binarizer(label_set) 17 | binarizer.check_label_consistency(labels) 18 | normalized_labels = binarizer.normalize(labels[1]) 19 | normalized_labels_actual = [0, 1, 0, 1] 20 | assert normalized_labels == normalized_labels_actual 21 | 22 | denormalized_labels = binarizer.denormalize(normalized_labels_actual) 23 | assert denormalized_labels == labels[1] 24 | 25 | 26 | def test_label_weights_normalizer(): 27 | label_weights = {'A': 0.4, 'B': 0.6} 28 | label_set = ['A', 'B'] 29 | binarizer = Binarizer(label_set) 30 | expected_label_weights = [{0: 1, 1: 0.4}, {0: 1, 1: 0.6}] 31 | assert expected_label_weights == \ 
32 | binarizer.normalize_weights(label_weights) 33 | -------------------------------------------------------------------------------- /tests/languages/util.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented as ro 2 | from revscoring.dependencies import solve 3 | 4 | 5 | def simple_eq(a, b): 6 | return a == b 7 | 8 | 9 | def compare_extraction(extractor, examples, counter_examples, 10 | lwrap="", rwrap="", eq=simple_eq): 11 | def process(text): 12 | return solve(extractor, cache={ro.revision.text: text}) 13 | 14 | for example in examples: 15 | wrapped = lwrap + example + rwrap 16 | assert eq(process(wrapped), [example]), \ 17 | " ".join([repr(wrapped), str(process(wrapped)), str([example])]) 18 | assert eq(process( 19 | "Sentence " + 20 | wrapped + 21 | " sandwich."), [example]) 22 | assert eq(process("Sentence end " + wrapped + "."), [example]) 23 | assert eq(process(wrapped + " start of sentence."), [example]) 24 | 25 | for example in counter_examples: 26 | wrapped = lwrap + example + rwrap 27 | assert process(wrapped) == [], process(wrapped) 28 | assert process("Sentence " + wrapped + " sandwich.") == [] 29 | assert process("Sentence end " + wrapped + ".") == [] 30 | assert process(wrapped + " start of sentence.") == [] 31 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/statistics.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autoclass:: revscoring.scoring.Statistics 3 | :members: 4 | :inherited-members: 5 | :member-order: 6 | """ 7 | import logging 8 | 9 | from ..model_info import ModelInfo 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class Statistics(ModelInfo): 15 | """ 16 | Construct a set of Statistics. Instances of this class work like a 17 | `dict` of statistical values once 18 | :func:`revscoring.scoring.Statistics.fit` is called. 
19 | """ 20 | def __init__(self, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | self.fitted = False 23 | 24 | def fit(self, score_labels): 25 | """ 26 | Fit to scores and labels. 27 | 28 | :Parameters: 29 | score_labels : [( `dict`, `mixed` )] 30 | A collection of scores-label pairs generated using 31 | :class:`revscoring.Model.score`. Note that fitting is usually 32 | done using data withheld during model training 33 | """ 34 | self.fitted = True 35 | 36 | def format_str(self, path_tree, **kwargs): 37 | raise NotImplementedError() 38 | 39 | def format_json(self, path_tree, **kwargs): 40 | raise NotImplementedError() 41 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/datasources/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.dependencies import DependentSet 2 | 3 | from . import edit, parsed, sentences, tokenized 4 | 5 | 6 | class BaseRevision(DependentSet): 7 | 8 | def __init__(self, name, revision_datasources): 9 | super().__init__(name) 10 | self.text = revision_datasources.text 11 | 12 | if hasattr(revision_datasources, "parent"): 13 | self.parent = Revision( 14 | name + ".parent", 15 | revision_datasources.parent 16 | ) 17 | 18 | 19 | class Revision(parsed.Revision, sentences.Revision, tokenized.Revision, 20 | BaseRevision): 21 | 22 | def __init__(self, name, revision_datasources): 23 | # Initializes all of the Revision datasources 24 | super().__init__(name, revision_datasources) 25 | 26 | # Initializes the diff using the Revision datasources 27 | if hasattr(revision_datasources, "diff"): 28 | self.diff = Diff(name + ".diff", self) 29 | 30 | 31 | class BaseDiff(DependentSet): 32 | 33 | def __init__(self, name, revision): 34 | super().__init__(name) 35 | self.revision = revision 36 | 37 | 38 | class Diff(edit.Diff, sentences.Diff, tokenized.Diff, BaseDiff): 39 | pass 40 | 
-------------------------------------------------------------------------------- /revscoring/datasources/meta/indexable.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that 3 | return `list`'s and `tuple`'s 4 | 5 | .. autoclass:: revscoring.datasources.meta.indexable.index 6 | 7 | """ 8 | from ..datasource import Datasource 9 | 10 | 11 | class index(Datasource): 12 | """ 13 | Generates a datasource that returns the value that appears at `i` 14 | 15 | :Parameters: 16 | i : `int` 17 | The index of a value to return 18 | default : `mixed` 19 | The value to return if no value exists at `i`. If not specified, 20 | an IndexError will be raised 21 | name : `str` 22 | A name for the new datasource. 23 | """ 24 | 25 | def __init__(self, i, datasources, default=NotImplemented, name=None): 26 | name = self._format_name(name, [i, default]) 27 | self.i = int(i) 28 | self.default = default 29 | super().__init__(name, self.process, 30 | depends_on=[datasources]) 31 | 32 | def process(self, indexable): 33 | try: 34 | return indexable[self.i] 35 | except IndexError: 36 | if self.default is not NotImplemented: 37 | return self.default 38 | else: 39 | raise 40 | -------------------------------------------------------------------------------- /revscoring/features/bytes/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from revscoring.datasources import revision_oriented 2 | from revscoring.dependencies import DependentSet 3 | 4 | from ..meta import aggregators 5 | from . 
import datasources 6 | 7 | name = "bytes.revision" 8 | 9 | 10 | class Revision(DependentSet): 11 | 12 | def __init__(self, name, revision_datasources): 13 | super().__init__(name) 14 | self.length = aggregators.len( 15 | revision_datasources.bytes, 16 | name=name + ".length" 17 | ) 18 | "`int` : The length of the revision content in bytes" 19 | 20 | if hasattr(revision_datasources, "parent"): 21 | self.parent = Revision( 22 | name + ".parent", 23 | revision_datasources.parent 24 | ) 25 | """ 26 | :class:`revscoring.features.bytes.Revision` : The 27 | parent (aka "previous") revision of the page. 28 | """ 29 | 30 | 31 | revision = Revision(name, 32 | datasources.Revision(name, revision_oriented.revision)) 33 | """ 34 | Represents the base revision of interest. Implements this a basic structure: 35 | 36 | * revision: :class:`~revscoring.features.bytes.Revision` 37 | * parent: :class:`~revscoring.features.bytes.Revision` 38 | """ 39 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/classification/label_thresholds.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import OrderedDict 3 | 4 | from ... 
import util 5 | from ...model_info import ModelInfo 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class LabelThresholds(ModelInfo): 11 | 12 | def lookup(self, path): 13 | if len(path) > 0: 14 | key = path[0] 15 | if len(path[1:]) > 0: 16 | logger.warn("Ignoring path at {0!r}".format(path[1:])) 17 | return self[key] 18 | else: 19 | return self 20 | 21 | def format_str(self, path_tree, **kwargs): 22 | formatted = "thresholds:\n" 23 | for label in path_tree.keys() or self.keys(): 24 | sub_tree = path_tree.get(label, {}) 25 | formatted += util.tab_it_in(repr(label)) 26 | table_str = self[label].format_str(sub_tree, **kwargs) 27 | formatted += util.tab_it_in(table_str, 2) 28 | formatted += "\n" 29 | return formatted 30 | 31 | def format_json(self, path_tree, **kwargs): 32 | doc = OrderedDict 33 | for label in path_tree.keys() or self.keys(): 34 | sub_tree = path_tree.get(label, {}) 35 | doc[label] = self[label].format_json(sub_tree, **kwargs) 36 | return doc 37 | -------------------------------------------------------------------------------- /revscoring/utilities/model_info.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``revscoring model_info -h`` 3 | :: 4 | 5 | Prints formatted information about a model file. 6 | 7 | 8 | Usage: 9 | module_info -h | --help 10 | module_info [...] [--formatting=] 11 | 12 | Options: 13 | -h --help Prints this documentation 14 | Path to a model file 15 | A model information path. If no path is provided, 16 | all default fields will be in the output. 17 | --formatting= What format to output the information? 
"str" or 18 | "json" [default: str] 19 | """ 20 | import json 21 | 22 | import docopt 23 | 24 | from ..scoring import Model, models 25 | 26 | 27 | def main(argv=None): 28 | args = docopt.docopt(__doc__, argv=argv) 29 | scoring_model = Model.load(models.open_file(args[''])) 30 | paths = args[''] 31 | formatting = args['--formatting'] 32 | 33 | run(scoring_model, paths, formatting) 34 | 35 | 36 | def run(scoring_model, paths, formatting): 37 | formatted = scoring_model.info.format(paths, formatting=formatting) 38 | if formatting == "json": 39 | formatted = json.dumps(formatted, indent=2) 40 | 41 | print(formatted) 42 | -------------------------------------------------------------------------------- /tests/utilities/test_util.py: -------------------------------------------------------------------------------- 1 | from revscoring.utilities.util import (read_labels_and_population_rates, 2 | read_labels_config) 3 | 4 | 5 | def test_plain_labels(): 6 | labels, label_weights, population_rates = read_labels_and_population_rates( 7 | "true,false", ["true=5"], ["true=0.1", "false=0.9"], None) 8 | 9 | assert labels == [True, False] 10 | assert label_weights == {True: 5} 11 | assert population_rates == {True: 0.1, False: 0.9} 12 | 13 | 14 | def test_pop_rates_labels(): 15 | labels, label_weights, population_rates = read_labels_and_population_rates( 16 | None, ["true=5"], ["true=0.1", "false=0.9"], None) 17 | 18 | assert labels == [True, False] 19 | assert label_weights == {True: 5} 20 | assert population_rates == {True: 0.1, False: 0.9} 21 | 22 | 23 | def test_labels_config(): 24 | labels_config = { 25 | 'name': "enwiki damaging", 26 | 'labels': [ 27 | {'value': True, 'weight': 5, 'population_rate': 0.1}, 28 | {'value': False, 'population_rate': 0.9} 29 | ]} 30 | labels, label_weights, population_rates = read_labels_config(labels_config) 31 | 32 | assert labels == [True, False] 33 | assert label_weights == {True: 5} 34 | assert population_rates == {True: 0.1, False: 0.9} 35 | 
-------------------------------------------------------------------------------- /.github/workflows/publish_python_package.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | paths: 8 | - 'revscoring/about.py' 9 | 10 | jobs: 11 | build-n-publish: 12 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@master 16 | - name: Set up Python 3.7 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: "3.7" 20 | - name: Install pypa/build 21 | run: >- 22 | python -m pip install build --user 23 | - name: Build a binary wheel and a source tarball 24 | run: >- 25 | python -m 26 | build 27 | --sdist 28 | --wheel 29 | --outdir dist/ 30 | . 31 | 32 | - name: Publish distribution 📦 to Test PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | password: ${{ secrets.PYPI_TEST_TOKEN }} 36 | repository_url: https://test.pypi.org/legacy/ 37 | 38 | - name: Publish distribution 📦 to PyPI 39 | uses: pypa/gh-action-pypi-publish@release/v1 40 | with: 41 | user: scoring-internal 42 | password: ${{ secrets.PYPI_PASS }} 43 | -------------------------------------------------------------------------------- /revscoring/languages/features/dictionary/util.py: -------------------------------------------------------------------------------- 1 | import enchant 2 | 3 | REPLACEMENT_CHAR = "\uFFFD" 4 | 5 | 6 | def utf16_cleanup(token): 7 | """ 8 | Removes chars that can't be represented in two bytes. This is important 9 | since `enchant` will expect that all strings passed to it are two-byte 10 | chars and print "This UTF-8 encoding can't convert to UTF-16:" if it can't 11 | decode. This prevents that problem. 
12 | See https://github.com/rfk/pyenchant/issues/58 13 | """ 14 | return "".join(c if ord(c) < 2 ** 16 else REPLACEMENT_CHAR 15 | for c in token) 16 | 17 | 18 | def load_dict(dict_name, target_package): 19 | try: 20 | return enchant.Dict(dict_name) 21 | except enchant.errors.DictNotFoundError: 22 | raise ImportError( 23 | ("No enchant-compatible dictionary found for {0!r}. " + 24 | "Consider installing {1!r}").format(dict_name, target_package)) 25 | 26 | 27 | class MultiDictChecker: 28 | """ 29 | Implements a check() method that will iterate through dictionaries looking 30 | for any correct spelling. 31 | """ 32 | 33 | def __init__(self, *dicts): 34 | self.dicts = dicts 35 | 36 | def check(self, word): 37 | for dict in self.dicts: 38 | if dict.check(word): 39 | return True 40 | return False 41 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/hashing.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that returns 3 | a list of strings (i.e. "tokens") and produces a list of ngram/skipgram 4 | sequences. 5 | 6 | .. autoclass:: revscoring.datasources.meta.hashing.hash 7 | 8 | """ 9 | import json 10 | 11 | import mmh3 12 | 13 | from ..datasource import Datasource 14 | 15 | 16 | class hash(Datasource): 17 | """ 18 | Converts a sequence of items into a sequence of portable hashes (`int`) 19 | based on the result of applying `str()`. E.g. `str(["foo"]) = '["foo"]'` 20 | 21 | :Parameters: 22 | items_datasource : :class:`revscoring.Datasource` 23 | A datasource that generates a list of items to be hashed 24 | n : `int` 25 | The number of potential hashes that can be produced 26 | name : `str` 27 | A name for the datasource. 
28 | """ 29 | 30 | def __init__(self, items_datasource, n=2 ** 20, name=None): 31 | name = self._format_name(name, [items_datasource, n]) 32 | super().__init__(name, self.process, 33 | depends_on=[items_datasource]) 34 | self.n = n 35 | 36 | def process(self, items): 37 | return [mmh3_item(item, self.n) for item in items] 38 | 39 | 40 | def mmh3_item(item, n): 41 | return (2**32 + mmh3.hash(json.dumps(item))) % n 42 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/gramming.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that returns 3 | a list of strings (i.e. "tokens") and produces a list of ngram/skipgram 4 | sequences. 5 | 6 | .. autoclass:: revscoring.datasources.meta.gramming.gram 7 | 8 | """ 9 | from ..datasource import Datasource 10 | 11 | 12 | class gram(Datasource): 13 | """ 14 | Converts a sequence of items into ngrams. 15 | 16 | :Parameters: 17 | items_datasource : :class:`revscoring.Datasource` 18 | A datasource that generates a list of some item 19 | grams : `list` ( `tuple` ( `int` ) ) 20 | A list of ngram and/or skipgram sequences to produce 21 | name : `str` 22 | A name for the datasource. 
23 | """ 24 | 25 | def __init__(self, items_datasource, grams=[(0,)], name=None): 26 | name = self._format_name(name, [items_datasource, grams]) 27 | super().__init__(name, self.process, 28 | depends_on=[items_datasource]) 29 | self.grams = grams 30 | 31 | def process(self, tokens): 32 | return list(gram_tokens(tokens, grams=self.grams)) 33 | 34 | 35 | def gram_tokens(items, grams=[(0,)]): 36 | for i in range(len(items)): 37 | for gram in grams: 38 | if gram == (0,): 39 | yield (items[i], ) 40 | elif len(items) > i + max(gram): 41 | yield tuple(items[i + offset] for offset in gram) 42 | -------------------------------------------------------------------------------- /revscoring/scoring/models/util.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import numpy 4 | 5 | 6 | def normalize(v): 7 | if isinstance(v, numpy.bool_): 8 | return bool(v) 9 | elif isinstance(v, numpy.ndarray): 10 | return [normalize(item) for item in v] 11 | elif v == numpy.NaN: 12 | return "NaN" 13 | elif v == numpy.NINF: 14 | return "-Infinity" 15 | elif v == numpy.PINF: 16 | return "Infinity" 17 | elif isinstance(v, numpy.floating): 18 | return float(v) 19 | elif isinstance(v, tuple): 20 | return list(v) 21 | else: 22 | return v 23 | 24 | 25 | def key_normalize(v): 26 | v = normalize(v) 27 | if isinstance(v, bool) or isinstance(v, int) or isinstance(v, float) or \ 28 | isinstance(v, str): 29 | return v 30 | elif isinstance(v, list) or isinstance(v, dict): 31 | return json.dumps(v) 32 | else: 33 | return str(v) 34 | 35 | 36 | def normalize_json(doc): 37 | if isinstance(doc, dict): 38 | return {key_normalize(k): normalize_json(v) 39 | for k, v in doc.items()} 40 | elif isinstance(doc, list) or isinstance(doc, tuple): 41 | return [normalize_json(v) for v in doc] 42 | else: 43 | return normalize(doc) 44 | 45 | 46 | def format_params(doc): 47 | if doc is None: 48 | return None 49 | else: 50 | return ", ".join("{0}={1}".format(k, 
json.dumps(v)) 51 | for k, v in doc.items()) 52 | -------------------------------------------------------------------------------- /docs/notes_on_adhoc_jobs.txt: -------------------------------------------------------------------------------- 1 | Given a set of rev_ids. Return the vandal scores. 2 | 3 | $ cat rev_ids.tsv | predict --source=enwiki_api.yaml --scorer=enwiki_svc.yaml > predictions.tsv 4 | 5 | ^^ This imagines a UNIX command line utility that takes a set of rev_ids and 6 | makes predictions 7 | 8 | 9 | Imagine some python (requires `pip install mwreverts`): 10 | 11 | # There exists a model file at enwiki.model 12 | from mwapi import Session 13 | import mwreverts 14 | 15 | from revscoring.extractors.api.extractor import Extractor 16 | from revscoring.scoring.models import LinearSVC 17 | 18 | session = Session("https://en.wikipedia.org/w/api.php") 19 | extractor = Extractor(session) 20 | 21 | model = Model.load(open("enwiki.model", "rb")) 22 | 23 | api_result = session.get(action='query', titles='Main Page', prop='revisions', rvlimit=500, rvprop='sha1|ids') 24 | revisions = next(iter(api_result['query']['pages'].values()))['revisions'] 25 | 26 | # Content that has been revision-deleted has a hidden SHA-1 27 | revisions = [revision for revision in revisions if 'sha1hidden' not in revision] 28 | reverted_set = set() 29 | 30 | for revert in mwreverts.detect((revision['sha1'], revision) for revision in revisions): 31 | for reverted in revert.reverteds: 32 | reverted_set.add(reverted['sha1']) 33 | 34 | for revision in revisions: 35 | if revision['sha1'] not in reverted_set: # no revert happened 36 | score = model.score([revision['revid']])['probability'][True] 37 | 38 | if score > .5: 39 | print(revision['pagetitle']) 40 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/features/revision_oriented.py: -------------------------------------------------------------------------------- 1 | from 
revscoring.dependencies import DependentSet 2 | 3 | from . import chars, edit_tokens, parsed, tokenized 4 | 5 | prefix = "wikitext.revision" 6 | 7 | 8 | class BaseRevision(DependentSet): 9 | 10 | def __init__(self, name, revision_datasources): 11 | super().__init__(name) 12 | self.datasources = revision_datasources 13 | 14 | if hasattr(self.datasources, "parent"): 15 | self.parent = Revision( 16 | name + ".parent", 17 | self.datasources.parent 18 | ) 19 | """ 20 | :class:`revscoring.features.wikitext.Revision` : The 21 | parent (aka "previous") revision of the page. 22 | """ 23 | 24 | if hasattr(self.datasources, "diff"): 25 | self.diff = Diff( 26 | name + ".diff", 27 | self.datasources.diff 28 | ) 29 | """ 30 | :class:`~revscoring.features.wikitext.Diff` : The 31 | difference between this revision and the parent revision. 32 | """ 33 | 34 | 35 | class Revision(parsed.Revision, chars.Revision, tokenized.Revision, 36 | BaseRevision): 37 | pass 38 | 39 | 40 | class BaseDiff(DependentSet): 41 | 42 | def __init__(self, name, diff_datasources, *args, **kwargs): 43 | super().__init__(name) 44 | self.datasources = diff_datasources 45 | 46 | 47 | class Diff(chars.Diff, edit_tokens.Diff, tokenized.Diff, BaseDiff): 48 | pass 49 | -------------------------------------------------------------------------------- /revscoring/datasources/meta/dicts.py: -------------------------------------------------------------------------------- 1 | """ 2 | These meta-datasources operate on :class:`revscoring.Datasource`'s that 3 | return `dict`'s 4 | 5 | .. autoclass:: revscoring.datasources.meta.dicts.keys 6 | 7 | .. autoclass:: revscoring.datasources.meta.dicts.values 8 | 9 | """ 10 | from ..datasource import Datasource 11 | 12 | 13 | class keys(Datasource): 14 | """ 15 | Generates a set of `dict` keys 16 | 17 | :Parameters: 18 | dict_datasource : :class:`revscoring.Datasource` 19 | A datasource that generates a `dict` 20 | name : `str` 21 | A name for the new datasource. 
22 | """ 23 | 24 | def __init__(self, dict_datasource, name=None): 25 | name = self._format_name(name, [dict_datasource]) 26 | super().__init__(name, self.process, 27 | depends_on=[dict_datasource]) 28 | 29 | def process(self, d): 30 | return (d or {}).keys() 31 | 32 | 33 | class values(Datasource): 34 | """ 35 | Generates a list of `dict` values 36 | 37 | :Parameters: 38 | dict_datasource : :class:`revscoring.Datasource` 39 | A datasource that generates a `dict` 40 | name : `str` 41 | A name for the new datasource. 42 | """ 43 | 44 | def __init__(self, dict_datasource, name=None): 45 | name = self._format_name(name, [dict_datasource]) 46 | super().__init__(name, self.process, 47 | depends_on=[dict_datasource]) 48 | 49 | def process(self, d): 50 | return [v for v in (d or {}).values()] 51 | -------------------------------------------------------------------------------- /tests/scoring/statistics/classification/tests/test_micro_macro_stats.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from revscoring.scoring.statistics.classification.micro_macro_stats import \ 4 | MicroMacroStats 5 | from revscoring.scoring.statistics.classification.scaled_prediction_statistics import \ 6 | ScaledPredictionStatistics as SPS # noqa 7 | 8 | 9 | def test_micro_macro_stats(): 10 | # (tp, fp, tn, fn) 11 | stats_keys = ['Short', 'Labels', 'Can', 'Be', 'Columns'] 12 | stats_values = [ 13 | SPS(counts=(10, 2, 5, 8)), 14 | SPS(counts=(9, 3, 9, 4)), 15 | SPS(counts=(11, 1, 8, 5)), 16 | SPS(counts=(10, 2, 9, 4)), 17 | SPS(counts=(5, 7, 3, 10)) 18 | ] 19 | stats = OrderedDict() 20 | for key, value in zip(stats_keys, stats_values): 21 | stats[key] = value 22 | mms = MicroMacroStats(stats, 'precision') 23 | 24 | print(mms.format_str({})) 25 | assert len(mms.format_str({}).split('\n')) <= 5 26 | assert list(stats.keys()) == list(mms['labels'].keys()) 27 | 28 | # (tp, fp, tn, fn) 29 | stats = { 30 | 'A really 
long label name': SPS(counts=(10, 2, 5, 8)), 31 | 'Another long label name': SPS(counts=(9, 3, 9, 4)), 32 | 'Again we\'re very long': SPS(counts=(11, 1, 8, 5)), 33 | 'We should be too long': SPS(counts=(10, 2, 9, 4)), 34 | 'One more for good measure': SPS(counts=(5, 7, 3, 10)) 35 | } 36 | mms = MicroMacroStats(stats, 'precision') 37 | 38 | print(mms.format_str({})) 39 | assert len(mms.format_str({}).split('\n')) > 5 40 | -------------------------------------------------------------------------------- /revscoring/utilities/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements a set of utilities for extracting features and 3 | train/testing :class:`revscoring.Model` from the command-line. When the 4 | revscoring python package is installed, a `revscoring` utility should be 5 | available from the commandline. Run `revscoring -h` for more 6 | information: 7 | 8 | check_model 9 | +++++++++++ 10 | .. automodule:: revscoring.utilities.check_model 11 | 12 | cv_train 13 | ++++++++ 14 | .. automodule:: revscoring.utilities.cv_train 15 | 16 | dump_cache 17 | ++++++++++ 18 | .. automodule:: revscoring.utilities.dump_cache 19 | 20 | extract 21 | +++++++ 22 | .. automodule:: revscoring.utilities.extract 23 | 24 | fetch_idioms 25 | ++++++++++++ 26 | .. automodule:: revscoring.utilities.fetch_idioms 27 | 28 | fetch_text 29 | ++++++++++ 30 | .. automodule:: revscoring.utilities.fetch_text 31 | 32 | fit 33 | +++ 34 | .. automodule:: revscoring.utilities.fit 35 | 36 | intersect_merge_observations 37 | ++++++++++++++++++++++++++++ 38 | .. automodule:: revscoring.utilities.intersect_merge_observations 39 | 40 | model_info 41 | ++++++++++ 42 | .. automodule:: revscoring.utilities.model_info 43 | 44 | score 45 | +++++ 46 | .. automodule:: revscoring.utilities.score 47 | 48 | test_model 49 | ++++++++++ 50 | .. automodule:: revscoring.utilities.test_model 51 | 52 | tune 53 | ++++ 54 | .. 
automodule:: revscoring.utilities.tune 55 | 56 | union_merge_observations 57 | ++++++++++++++++++++++++ 58 | .. automodule:: revscoring.utilities.union_merge_observations 59 | 60 | util 61 | ++++ 62 | .. automodule:: revscoring.utilities.util 63 | """ 64 | -------------------------------------------------------------------------------- /tests/dependencies/test_context.py: -------------------------------------------------------------------------------- 1 | 2 | from revscoring.dependencies.context import Context 3 | from revscoring.dependencies.dependent import Dependent 4 | 5 | 6 | def test_context(): 7 | # No context 8 | context = Context() 9 | foo = Dependent("foo", lambda: "foo") 10 | bar = Dependent("bar", lambda: "bar") 11 | foobar = Dependent("foobar", lambda foo, bar: foo + bar, 12 | depends_on=[foo, bar]) 13 | assert context.solve(foobar) == "foobar" 14 | assert list(context.solve([foo, bar, foobar])) == ["foo", "bar", "foobar"] 15 | 16 | # Cache context 17 | context = Context(cache={bar: "baz"}) 18 | assert context.solve(foobar) == "foobaz" 19 | 20 | # Context context 21 | mybar = Dependent("bar", lambda: "baz") 22 | 23 | context = Context(context={mybar}) 24 | assert context.solve(foobar) == "foobaz" 25 | 26 | context = Context(context={mybar: mybar}) 27 | assert context.solve(foobar) == "foobaz" 28 | 29 | context = Context(context={bar: mybar}) 30 | assert context.solve(foobar) == "foobaz" 31 | 32 | context = Context(context={bar: lambda: "baz"}) 33 | assert context.solve(foobar) == "foobaz" 34 | context.update(context={bar: lambda: "buzz"}) 35 | assert context.solve(foobar) == "foobuzz" 36 | 37 | assert set(context.expand([foobar])) == {foo, bar, foobar} 38 | 39 | context.update(context={bar: bar}) 40 | assert set(context.dig([foobar])) == {foo, bar} 41 | 42 | assert (context.draw(foobar) == " - \n" + 43 | "\t - \n" + 44 | "\t - \n") 45 | -------------------------------------------------------------------------------- 
/revscoring/languages/features/matches/matches.py: -------------------------------------------------------------------------------- 1 | from . import datasources, features 2 | from ....dependencies import DependentSet 3 | from ....features import wikitext 4 | 5 | 6 | class Matches(DependentSet): 7 | def __init__(self, name, matcher, match_list, exclusions=None, 8 | text_preprocess=None): 9 | super().__init__(name) 10 | self._match_list = match_list 11 | self._exclusions = exclusions 12 | self.revision = features.Revision( 13 | name + ".revision", 14 | datasources.Revision( 15 | name + ".revision", matcher, 16 | wikitext.revision.datasources, 17 | text_preprocess=text_preprocess 18 | ) 19 | ) 20 | """ 21 | :class:`~revscoring.languages.features.matches.Revision` : 22 | The base revision feature set. 23 | """ 24 | 25 | def excluding(self, exclusions, name=None): 26 | """ 27 | Returns a new :class:`~revscoring.languages.features.Matches` 28 | that includes a set of exclusions. 29 | 30 | :Parameters: 31 | exclusions : `list` ( `str` ) 32 | A list of terms to explicitly not match 33 | name : `str` 34 | A new name for the collection. 
If unspecified, the old name 35 | will be used 36 | """ 37 | return self.__class__( 38 | name or self._name + ".excluding({0!r})".format(exclusions), 39 | self._match_list, 40 | exclusions=(self._exclusions or []) + exclusions) 41 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_random_forest.py: -------------------------------------------------------------------------------- 1 | from revscoring.scoring.models.model import Model 2 | from revscoring.scoring.models.random_forest import RandomForest 3 | 4 | from .util import (FEATURES, format_info, pickle_and_unpickle, train_test, 5 | train_test_multilabel) 6 | 7 | 8 | def test_random_forest(): 9 | model = RandomForest(FEATURES, [True, False]) 10 | format_info(model) 11 | train_test(model) 12 | reconstructed_model = pickle_and_unpickle(model) 13 | train_test(reconstructed_model) 14 | format_info(model) 15 | 16 | config = { 17 | 'scorer_models': { 18 | 'test': { 19 | 'class': "revscoring.scoring.models.RandomForest", 20 | 'labels': [True, False], 21 | 'features': [1, 2, 3] 22 | } 23 | } 24 | } 25 | model = Model.from_config(config, 'test') 26 | assert isinstance(model, RandomForest) 27 | 28 | 29 | def test_random_forest_multilabel(): 30 | model = RandomForest(FEATURES, ["A", "B", "C"], multilabel=True) 31 | format_info(model) 32 | train_test_multilabel(model) 33 | reconstructed_model = pickle_and_unpickle(model) 34 | train_test_multilabel(reconstructed_model) 35 | format_info(model) 36 | 37 | config = { 38 | 'scorer_models': { 39 | 'test': { 40 | 'class': "revscoring.scoring.models.RandomForest", 41 | 'labels': ["A", "B", "C"], 42 | 'features': [1, 2, 3] 43 | } 44 | } 45 | } 46 | model = Model.from_config(config, 'test') 47 | assert isinstance(model, RandomForest) 48 | -------------------------------------------------------------------------------- /revscoring/scoring/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | This module contains a collection of models that implement a simple function: 3 | :func:`~revscoring.Model.score`. Currently, all models are 4 | a subclass of :class:`revscoring.scoring.models.Learned` 5 | which means that they also implement 6 | :meth:`~revscoring.scoring.models.Learned.train` and 7 | :meth:`~revscoring.scoring.models.Learned.cross_validate`. 8 | 9 | Gradient Boosting 10 | +++++++++++++++++ 11 | .. automodule:: revscoring.scoring.models.gradient_boosting 12 | 13 | Naive Bayes 14 | +++++++++++ 15 | .. automodule:: revscoring.scoring.models.naive_bayes 16 | 17 | Linear Regression 18 | +++++++++++++++++ 19 | .. automodule:: revscoring.scoring.models.linear 20 | 21 | Support Vector 22 | ++++++++++++++ 23 | .. automodule:: revscoring.scoring.models.svc 24 | 25 | Random Forest 26 | +++++++++++++ 27 | .. automodule:: revscoring.scoring.models.random_forest 28 | 29 | Abstract classes 30 | ++++++++++++++++ 31 | .. automodule:: revscoring.scoring.models.model 32 | 33 | SciKit Learn-based models 34 | +++++++++++++++++++++++++ 35 | .. 
automodule:: revscoring.scoring.models.sklearn 36 | 37 | """ 38 | from .gradient_boosting import GradientBoosting 39 | from .linear import LogisticRegression 40 | from .model import Classifier, Learned, open_file 41 | from .naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, NaiveBayes 42 | from .random_forest import RandomForest 43 | from .svc import RBFSVC, SVC, LinearSVC 44 | 45 | __all__ = [ 46 | Learned, Classifier, open_file, 47 | SVC, LinearSVC, RBFSVC, NaiveBayes, GaussianNB, MultinomialNB, BernoulliNB, 48 | RandomForest, GradientBoosting, LogisticRegression 49 | ] 50 | -------------------------------------------------------------------------------- /tests/scoring/models/tests/test_gradient_boosting.py: -------------------------------------------------------------------------------- 1 | from revscoring.scoring.models.gradient_boosting import GradientBoosting 2 | from revscoring.scoring.models.model import Model 3 | 4 | from .util import (FEATURES, format_info, pickle_and_unpickle, train_test, 5 | train_test_multilabel) 6 | 7 | 8 | def test_gradient_boosting(): 9 | model = GradientBoosting(FEATURES, [True, False]) 10 | format_info(model) 11 | train_test(model) 12 | reconstructed_model = pickle_and_unpickle(model) 13 | train_test(reconstructed_model) 14 | format_info(model) 15 | 16 | config = { 17 | 'scorer_models': { 18 | 'test': { 19 | 'class': "revscoring.scoring.models.GradientBoosting", 20 | 'labels': [True, False], 21 | 'features': [1, 2, 3] 22 | } 23 | } 24 | } 25 | model = Model.from_config(config, 'test') 26 | assert isinstance(model, GradientBoosting) 27 | 28 | 29 | def test_gradient_boosting_multilabel(): 30 | model = GradientBoosting(FEATURES, ["A", "B", "C"], multilabel=True) 31 | format_info(model) 32 | train_test_multilabel(model) 33 | reconstructed_model = pickle_and_unpickle(model) 34 | train_test_multilabel(reconstructed_model) 35 | format_info(model) 36 | 37 | config = { 38 | 'scorer_models': { 39 | 'test': { 40 | 'class': 
"revscoring.scoring.models.GradientBoosting", 41 | 'labels': ["A", "B", "C"], 42 | 'features': [1, 2, 3] 43 | } 44 | } 45 | } 46 | model = Model.from_config(config, 'test') 47 | assert isinstance(model, GradientBoosting) 48 | -------------------------------------------------------------------------------- /revscoring/features/functions.py: -------------------------------------------------------------------------------- 1 | """ 2 | .. autofunction:: revscoring.features.trim 3 | """ 4 | from itertools import chain 5 | 6 | from .feature import Constant, Feature, Modifier 7 | 8 | 9 | def trim(features, context=None): 10 | """ 11 | Trims a feature set down to a bare set of :class:`~revscoring.Feature` by 12 | removing :class:`~revscoring.features.Modifier` and 13 | :class:`~revscoring.features.Constant`. 14 | 15 | :Parameters: 16 | features : `list` ( :class:`revscoring.Feature` ) 17 | A feature list to trim 18 | context : `dict` | `set` 19 | A context to apply while trimming 20 | """ 21 | context = context or {} 22 | cache = set() 23 | 24 | if hasattr(features, "__iter__"): 25 | for feature in features: 26 | yield from _trim(feature, context, cache) 27 | else: 28 | yield from _trim(features, context, cache) 29 | 30 | 31 | def _trim(dependent, context, cache): 32 | if isinstance(dependent, Feature): 33 | feature = dependent 34 | if isinstance(feature, Modifier): 35 | for dependent in feature.dependencies: 36 | yield from _trim(dependent, context, cache) 37 | elif isinstance(feature, Constant): 38 | pass 39 | else: 40 | if feature not in cache: 41 | cache.add(feature) 42 | yield feature 43 | 44 | 45 | def vectorize_values(feature_values): 46 | """ 47 | Converts a list of feature_values that contains sub-FeatureVector 48 | into a flat list of values. 
49 | """ 50 | return list(chain(*(val if hasattr(val, "__iter__") else [val] 51 | for val in feature_values))) 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | import sys 4 | 5 | from setuptools import find_packages, setup 6 | 7 | about_path = os.path.join(os.path.dirname(__file__), "revscoring/about.py") 8 | exec(compile(open(about_path).read(), about_path, "exec")) 9 | 10 | 11 | if sys.version_info <= (3, 0): 12 | print("Revscoring needs Python 3 to run properly. Your version is " + 13 | platform.python_version()) 14 | sys.exit(1) 15 | 16 | 17 | def read(fname): 18 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 19 | 20 | 21 | def requirements(fname): 22 | return [line.strip() 23 | for line in open(os.path.join(os.path.dirname(__file__), fname))] 24 | 25 | 26 | setup( 27 | python_requires=">=3", 28 | name=__name__, # noqa 29 | version=__version__, # noqa 30 | author=__author__, # noqa 31 | author_email=__author_email__, # noqa 32 | description=__description__, # noqa 33 | url=__url__, # noqa 34 | license=__license__, # noqa 35 | entry_points={ 36 | 'console_scripts': [ 37 | 'revscoring = revscoring.revscoring:main', 38 | ], 39 | }, 40 | packages=find_packages(), 41 | long_description=read('README.md'), 42 | long_description_content_type="text/markdown", 43 | install_requires=requirements("requirements.txt"), 44 | include_package_data=True, 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Programming Language :: Python", 48 | "Programming Language :: Python :: 3", 49 | "Environment :: Other Environment", 50 | "Intended Audience :: Developers", 51 | "License :: OSI Approved :: MIT License", 52 | "Operating System :: OS Independent" 53 | ], 54 | ) 55 | -------------------------------------------------------------------------------- /tests/languages/test_basque.py: 
-------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.languages import basque 4 | from revscoring.datasources import revision_oriented 5 | from revscoring.dependencies import solve 6 | 7 | # from .util import compare_extraction 8 | 9 | BAD = [ 10 | ] 11 | 12 | INFORMAL = [ 13 | ] 14 | 15 | OTHER = [ 16 | ] 17 | 18 | r_text = revision_oriented.revision.text 19 | 20 | 21 | ''' 22 | @mark.nottravis 23 | def test_badwords(): 24 | compare_extraction(basque.badwords.revision.datasources.matches, 25 | BAD, OTHER) 26 | 27 | assert basque.badwords == pickle.loads(pickle.dumps(basque.badwords)) 28 | 29 | 30 | @mark.nottravis 31 | def test_informals(): 32 | compare_extraction(basque.informals.revision.datasources.matches, 33 | INFORMAL, OTHER) 34 | 35 | assert basque.informals == pickle.loads(pickle.dumps(basque.informals)) 36 | ''' 37 | 38 | 39 | def test_dictionary(): 40 | cache = {r_text: "gizonezko dominadun worngly."} 41 | assert solve(basque.dictionary.revision.datasources.dict_words, 42 | cache=cache) == ['gizonezko'] 43 | assert solve(basque.dictionary.revision.datasources.non_dict_words, 44 | cache=cache) == ["dominadun", "worngly"] 45 | 46 | assert basque.dictionary == pickle.loads(pickle.dumps(basque.dictionary)) 47 | 48 | ''' 49 | @mark.nottravis 50 | def test_stopwords(): 51 | cache = {r_text: "আন চলচ্চিত্র."} 52 | assert (solve(basque.stopwords.revision.datasources.stopwords, cache=cache) == 53 | ["আন"]) 54 | assert (solve(basque.stopwords.revision.datasources.non_stopwords, 55 | cache=cache) == 56 | ['চলচ্চিত্র']) 57 | 58 | assert basque.stopwords == pickle.loads(pickle.dumps(basque.stopwords)) 59 | ''' 60 | -------------------------------------------------------------------------------- /revscoring/languages/features/stemmed/datasources.py: -------------------------------------------------------------------------------- 1 | from ....datasources.meta import frequencies, mappers 2 | from 
class DictDiff:
    """
    Represents the difference between two dictionaries
    """
    __slots__ = ('added', 'removed', 'intersection', 'changed', 'unchanged')

    def __init__(self, added, removed, intersection, changed, unchanged):
        self.added = added
        """
        `set` ( `mixed` ) : Keys that only appear in the new dictionary
        """

        self.removed = removed
        """
        `set` ( `mixed` ) : Keys that only appear in the old dictionary
        """

        self.intersection = intersection
        """
        `set` ( `mixed` ) : Keys that appear in both dictionaries
        """

        self.changed = changed
        """
        `set` ( `mixed` ) : Keys present in both dictionaries whose values
        differ
        """

        self.unchanged = unchanged
        """
        `set` ( `mixed` ) : Keys present in both dictionaries with equal
        values
        """


def diff_dicts(a, b):
    """
    Generates a diff between two dictionaries.

    :Parameters:
        a : `dict`
            A dict to diff or `None`
        b : `dict`
            B dict to diff
    """
    old = a or {}  # Treat a missing "before" dict as empty

    added = b.keys() - old.keys()
    removed = old.keys() - b.keys()
    shared = old.keys() & b.keys()

    # Partition the shared keys by whether their values changed.
    changed = {key for key in shared if old[key] != b[key]}
    unchanged = shared - changed

    return DictDiff(added, removed, shared, changed, unchanged)
def get_last_two(id):
    """Return the last two decimal digits of *id* as an int."""
    digits = str(id)
    return int(digits[-2:])
def test_text_extractor():
    """Regex extraction over a plain-text datasource, including None text."""
    matches = solve(text_extractor,
                    cache={text: "This is some text foo bar nope bar foo"})
    assert matches == ["foo bar", "bar foo"]

    # Missing (None) text yields no matches rather than an error.
    assert solve(text_extractor, cache={text: None}) == []

    assert pickle.loads(pickle.dumps(text_extractor)) == text_extractor
def test_segment_extractor():
    """Regex extraction applied per-segment, concatenating all matches."""
    segment_values = ["This is some text foo bar nope bar foo", "foo bar",
                      "foo"]
    found = solve(segment_extractor, cache={segments: segment_values})
    assert found == ["foo bar", "bar foo", "foo bar"]

    assert pickle.loads(pickle.dumps(segment_extractor)) == segment_extractor
67 | """ 68 | 69 | informal_regexes = [ 70 | r"아니오" 71 | r"잠시만요" 72 | r"합니다만", r"입니다만", 73 | r"\w*니다", r"\w+니까", 74 | r"\w*세요", 75 | r"\w*데요", 76 | r"\w*지요", 77 | r"\w*네요", 78 | r"\w*어요", 79 | r"\w*하죠" 80 | ] 81 | 82 | informals = RegexMatches(name + ".informals", informal_regexes) 83 | """ 84 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 85 | informal word detecting regexes. 86 | """ 87 | -------------------------------------------------------------------------------- /revscoring/scoring/statistics/classification/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classification statistics can be generated for "Classifiers" -- models 3 | that produce factors (aka levels) as an ouput. E.g. True and False or 4 | "A", "B", or "C". 5 | 6 | .. autoclass:: revscoring.scoring.statistics.Classification 7 | :members: 8 | :member-order: 9 | 10 | .. autoclass:: revscoring.scoring.statistics.classification.Counts 11 | :members: 12 | :member-order: 13 | 14 | .. autoclass:: revscoring.scoring.statistics.classification.Rates 15 | :members: 16 | :member-order: 17 | 18 | .. autoclass:: revscoring.scoring.statistics.classification.MicroMacroStats 19 | :members: 20 | :member-order: 21 | 22 | .. autoclass:: revscoring.scoring.statistics.classification.ScaledPredictionStatistics 23 | :members: 24 | :member-order: 25 | 26 | .. autoclass:: revscoring.scoring.statistics.classification.ScaledThresholdStatistics 27 | :members: 28 | :member-order: 29 | 30 | .. autoclass:: revscoring.scoring.statistics.classification.ScaledClassificationMatrix 31 | :members: 32 | :member-order: 33 | 34 | .. 
from .classification import Classification
from .counts import Counts
from .micro_macro_stats import MicroMacroStats
from .rates import Rates
from .scaled_classification_matrix import ScaledClassificationMatrix
from .scaled_prediction_statistics import ScaledPredictionStatistics
from .scaled_threshold_statistics import ScaledThresholdStatistics
from .threshold_optimization import ThresholdOptimization

# __all__ must list *names* (strings): `from package import *` raises
# TypeError on non-str items, and API tools that read __all__ expect
# strings, not the class objects themselves.
__all__ = ["Classification", "Counts", "Rates", "MicroMacroStats",
           "ScaledPredictionStatistics", "ScaledThresholdStatistics",
           "ScaledClassificationMatrix", "ThresholdOptimization"]
40 | 41 | class key_exists(Datasource): 42 | 43 | def __init__(self, key, dict_datasource, name=None): 44 | self.key = key 45 | if name is None: 46 | name = "{1} in {0}".format(dict_datasource.name, repr(key)) 47 | 48 | super().__init__(name, self.process, depends_on=[dict_datasource]) 49 | 50 | def process(self, d): 51 | return self.key in d 52 | 53 | 54 | def _lookup_keys(keys, d): 55 | if isinstance(keys, str) or not hasattr(keys, "__iter__"): 56 | keys = [keys] 57 | try: 58 | for key in keys: 59 | d = d[key] 60 | except KeyError: 61 | raise KeyError(keys) 62 | return d 63 | -------------------------------------------------------------------------------- /tests/datasources/meta/tests/test_filters.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import re 3 | 4 | from revscoring.datasources.datasource import Datasource 5 | from revscoring.datasources.meta import filters 6 | from revscoring.dependencies import solve 7 | 8 | tokens = Datasource("tokens") 9 | 10 | foo_tokens = filters.regex_matching("foo", tokens, name="foo_tokens") 11 | foo_case_tokens = filters.regex_matching(re.compile("foo"), tokens, 12 | name="foo_case_tokens") 13 | 14 | my_ints = Datasource("my_ints") 15 | 16 | positive_ints = filters.positive(my_ints) 17 | negative_ints = filters.negative(my_ints) 18 | 19 | not_none_tokens = filters.not_none(tokens, name="not_none_tokens") 20 | 21 | 22 | def test_regex_matching(): 23 | cache = {tokens: ["foo", "bar", "FOO"]} 24 | assert (solve(foo_tokens, cache=cache) == 25 | ["foo", "FOO"]) 26 | 27 | assert (solve(foo_case_tokens, cache=cache) == 28 | ["foo"]) 29 | 30 | assert pickle.loads(pickle.dumps(foo_tokens)) == foo_tokens 31 | assert pickle.loads(pickle.dumps(foo_case_tokens)) == foo_case_tokens 32 | 33 | assert foo_tokens != foo_case_tokens 34 | 35 | 36 | def test_positive(): 37 | cache = {my_ints: [1, 0, -1]} 38 | assert (solve(positive_ints, cache=cache) == 39 | [1]) 40 | assert 
def test_key():
    # Single-key lookup from a dict-producing datasource.
    my_dict = Datasource("my_dict")
    foo = key('foo', my_dict)
    assert solve(foo, cache={my_dict: {'foo': "bar"}}) == 'bar'
    # NOTE(review): an empty repr() is implausible -- this expected string
    # (and the one below) looks truncated/garbled; confirm against
    # Datasource.__repr__ before trusting these assertions.
    assert repr(foo) == ""

    # `apply=` post-processes the looked-up value; or_none(int) keeps None.
    bar = key('bar', my_dict, apply=or_none(int))
    assert solve(bar, cache={my_dict: {'bar': None}}) is None
    assert solve(bar, cache={my_dict: {'bar': "1"}}) == 1

    # A list of keys walks nested dicts; a missing path yields None.
    foobar = key(['foo', 'bar'], my_dict)
    assert solve(foobar, cache={my_dict: {'bar': 1}}) is None
    assert solve(foobar, cache={my_dict: {'foo': {'bar': 1}}}) == 1
    # NOTE(review): same garbled-looking expected repr as above.
    assert repr(foobar) == ""

    # All key datasources must survive a pickle round-trip.
    assert pickle.loads(pickle.dumps(foo)) == foo
    assert pickle.loads(pickle.dumps(bar)) == bar
    assert pickle.loads(pickle.dumps(foobar)) == foobar
def extract_first_char(token):
    """Return the leading character of *token* (empty slice if empty)."""
    head = token[:1]
    return head
def test_de1337():
    """Leetspeak characters are mapped back to plain letters."""
    leet_input = {tokens: ["1337", "W4ff1e"]}
    assert solve(de1337_tokens, cache=leet_input) == ["leet", "Waffle"]

    assert pickle.loads(pickle.dumps(de1337_tokens)) == de1337_tokens
def test_rev_doc_by_id():
    """RevDocById must be hashable and survive a pickle round-trip."""
    session = mwapi.Session("foobar")
    rev_doc_by_id = RevDocById(ro.revision, Extractor(session))

    hash(rev_doc_by_id)
    assert pickle.loads(pickle.dumps(rev_doc_by_id)) == rev_doc_by_id
def test_user_info_doc():
    """UserInfoDoc must be hashable and survive a pickle round-trip."""
    session = mwapi.Session("foobar")
    user_info_doc = UserInfoDoc(ro.revision.user, Extractor(session))

    hash(user_info_doc)
    roundtripped = pickle.loads(pickle.dumps(user_info_doc))
    assert roundtripped == user_info_doc
39 | """ 40 | 41 | # Copied from https://gist.github.com/whym/b5ac3feb2a78797c9d98 42 | # Yusuke Matsubara (CCO) 43 | informal_regexes = [ 44 | # Words 45 | r"\(笑\)", 46 | r"\(笑\)", 47 | r"・・・+", 48 | r"お願いします", 49 | r"こんにちは", 50 | r"はじめまして", 51 | r"ありがとうございます", 52 | r"ありがとうございました", 53 | r"すみません", 54 | r"思います", 55 | r"はい", 56 | r"いいえ", 57 | r"ですが", 58 | r"あなた", 59 | r"おっしゃる", 60 | # Patterns 61 | r"ね。", 62 | r"な。", 63 | r"よ。", 64 | r"わ。", 65 | r"が。", 66 | r"は。", 67 | r"に。", 68 | r"か?", 69 | r"んか。", 70 | r"すか。", 71 | r"ます。", 72 | r"せん。", 73 | r"です。", 74 | r"ました。", 75 | r"でした。", 76 | r"しょう。", 77 | r"しょうか。", 78 | r"ください。", 79 | r"下さい。", 80 | r"ますが", 81 | r"ですが", 82 | r"ましたが", 83 | r"でしたが", 84 | r"さん、", 85 | r"様、", 86 | r"ちゃい", 87 | r"ちゃう", 88 | r"ちゃえ", 89 | r"ちゃっ", 90 | r"っちゃ", 91 | r"じゃない", 92 | r"じゃなく" 93 | ] 94 | 95 | informals = RegexMatches(name + ".informals", informal_regexes, 96 | wrapping=False) 97 | """ 98 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 99 | informal word detecting regexes. 100 | """ 101 | -------------------------------------------------------------------------------- /revscoring/features/wikitext/datasources/sentences.py: -------------------------------------------------------------------------------- 1 | from deltas.segmenters import MatchableSegment 2 | 3 | from revscoring.datasources import Datasource 4 | from revscoring.datasources.meta import indexable 5 | 6 | 7 | class Revision: 8 | 9 | def __init__(self, name, revision_datasources): 10 | super().__init__(name, revision_datasources) 11 | 12 | self.sentences = Datasource( 13 | self._name + ".sentences", psw2sentences, 14 | depends_on=[self.paragraphs_sentences_and_whitespace] 15 | ) 16 | """ 17 | A list of "sentences" extracted from the text. 
18 | """ 19 | 20 | 21 | class Diff(): 22 | 23 | def __init__(self, *args, **kwargs): 24 | super().__init__(*args, **kwargs) 25 | 26 | self.sentences_added_removed = Datasource( 27 | self._name + ".sentences_added_removed", set_diff, 28 | depends_on=[self.revision.sentences, 29 | self.revision.parent.sentences] 30 | ) 31 | 32 | self.sentences_added = indexable.index( 33 | 0, self.sentences_added_removed, 34 | name=self._name + ".sentences_added" 35 | ) 36 | """ 37 | A set of sentences that were added in this edit 38 | """ 39 | 40 | self.sentences_removed = indexable.index( 41 | 1, self.sentences_added_removed, 42 | name=self._name + ".sentences_removed" 43 | ) 44 | """ 45 | A set of sentences that were removed in this edit 46 | """ 47 | 48 | 49 | def psw2sentences(segments): 50 | sentences = [] 51 | for paragraph_or_whitespace in segments: 52 | if isinstance(paragraph_or_whitespace, MatchableSegment): 53 | paragraph = paragraph_or_whitespace # We have a paragraph 54 | for sentence_or_whitespace in paragraph: 55 | if isinstance(sentence_or_whitespace, MatchableSegment): 56 | sentence = sentence_or_whitespace # We have a sentence 57 | sentences.append(sentence) 58 | return sentences 59 | 60 | 61 | def set_diff(a, b): 62 | a, b = set(a), set(b) 63 | return (a - b, b - a) 64 | -------------------------------------------------------------------------------- /tests/dependencies/test_dependent.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from pytest import raises 4 | 5 | from revscoring.dependencies.dependent import Dependent, DependentSet 6 | 7 | 8 | def test_dependent(): 9 | 10 | foobar1 = Dependent("foobar", lambda: "foobar1") 11 | foobar2 = Dependent("foobar", lambda: "foobar2") 12 | 13 | assert foobar1 == foobar2 14 | assert foobar1 != "foo" 15 | 16 | assert hash(foobar1) == hash(foobar2) 17 | 18 | assert foobar1 in {foobar2} 19 | 20 | 21 | def test_name_type(): 22 | with raises(TypeError): 23 | 
def test_dependent_set():
    """Exercise DependentSet's container and set-algebra behavior."""
    dep_c = Dependent('c')
    dep_d = Dependent('d')
    dep_e = Dependent('e')

    my_dependents = DependentSet("my_dependents")
    my_dependents.c = dep_c

    # Equality, sizing, and iteration
    assert my_dependents == my_dependents
    assert my_dependents != "foo"
    assert len(my_dependents) == 1
    assert set(my_dependents) == {dep_c}

    # Membership
    assert dep_c in my_dependents
    assert dep_d not in my_dependents, set(my_dependents)

    my_dependents.d = dep_d

    # Set algebra against plain sets
    assert my_dependents & {dep_d} == {dep_d}
    assert my_dependents & {dep_e} == set()
    assert my_dependents | {dep_e} == {dep_c, dep_d, dep_e}
    assert my_dependents - {dep_c} == {dep_d}

    # Nested DependentSets contribute their members transitively.
    my_sub_dependents = DependentSet("my_sub_dependents")
    dep_f = Dependent('f')
    my_sub_dependents.f = dep_f
    my_dependents.sub = my_sub_dependents

    assert my_sub_dependents.f in my_dependents
    assert set(my_dependents) == {dep_c, dep_d, dep_f}
    assert my_dependents & {dep_d} == {dep_d}
    assert my_dependents & {dep_f} == {dep_f}
    assert my_dependents | {dep_e} == {dep_c, dep_d, dep_e, dep_f}
    assert my_dependents - {dep_f} == {dep_c, dep_d}

    assert pickle.loads(pickle.dumps(my_dependents)) == my_dependents
7 | - 3.5 8 | - 3.7 9 | - 3.8 10 | sudo: required 11 | addons: 12 | apt: 13 | packages: 14 | - g++ 15 | - gfortran 16 | - libblas-dev 17 | - liblapack-dev 18 | - libopenblas-dev 19 | - python3-dev 20 | - enchant 21 | - aspell-ar 22 | - aspell-bn 23 | - aspell-el 24 | - aspell-id 25 | - aspell-is 26 | - aspell-pl 27 | - aspell-ro 28 | - aspell-sv 29 | - aspell-ta 30 | - aspell-uk 31 | - myspell-cs 32 | - myspell-de-at 33 | - myspell-de-ch 34 | - myspell-de-de 35 | - myspell-es 36 | - myspell-et 37 | - myspell-fa 38 | - myspell-fr 39 | - myspell-he 40 | - myspell-hr 41 | - myspell-hu 42 | - myspell-lv 43 | - myspell-nb 44 | - myspell-nl 45 | - myspell-pt-pt 46 | - myspell-pt-br 47 | - myspell-ru 48 | - myspell-hr 49 | - hunspell-bs 50 | - hunspell-ca 51 | - hunspell-en-au 52 | - hunspell-en-us 53 | - hunspell-en-gb 54 | - hunspell-eu 55 | - hunspell-gl 56 | - hunspell-it 57 | - hunspell-hi 58 | - hunspell-sr 59 | - hunspell-vi 60 | - voikko-fi 61 | 62 | before_install: {} 63 | install: 64 | - pip install -r requirements.txt 65 | - pip install -r docs/requirements.txt 66 | - python -m nltk.downloader stopwords 67 | - pip install -r test-requirements.txt 68 | - pip install twine 69 | script: 70 | - flake8 . 
def test_gaussian_nb():
    """Train, pickle-round-trip, and config-load a GaussianNB model."""
    model = GaussianNB(FEATURES, [True, False])
    format_info(model)
    train_test(model)

    # The model must still train/test after a pickle round-trip.
    restored = pickle_and_unpickle(model)
    train_test(restored)
    format_info(model)

    config = {
        'scorer_models': {
            'test': {
                'class': "revscoring.scoring.models.GaussianNB",
                'labels': [True, False],
                'features': [1, 2, 3]
            }
        }
    }
    assert isinstance(Model.from_config(config, 'test'), GaussianNB)
"""
These Meta-Features generate a :class:`revscoring.FeatureVector` based on some
:class:`revscoring.Datasource`.

.. autoclass:: revscoring.features.meta.vectorizers.vectorize
"""
from ..feature_vector import FeatureVector


class vectorize(FeatureVector):
    """
    Constructs a :class:`revscoring.FeatureVector` that converts a
    dictionary into a list of values with a predictable order based on a set
    of keys.

    :Parameters:
        dict_datasource : :class:`revscoring.Datasource`
            A datasource that returns a dictionary of values.  If the
            datasource implements a `keys()` method, that will be used for
            selecting keys to vectorize
        keys : `iterable` ( `hashable` )
            A collection of keys to be vectorized from the dictionary.  If
            specified, this will override the `keys()` method on the
            `dict_datasource`
        returns : `func`
            A function that represents the type of value that will be
            contained in the vector.  When called without an argument, this
            function should return the default value for keys missing from
            the dict.
        name : `str`
            A name for the `revscoring.FeatureVector`
    """

    def __init__(self, dict_datasource, keys=None, returns=None, name=None):
        if keys is None:
            if hasattr(dict_datasource, "keys"):
                keys = dict_datasource.keys()
            else:
                raise AttributeError(
                    "{0} does not have a keys() ".format(dict_datasource) +
                    "method and `keys` argument was not specified")

        # `keys` is guaranteed non-None here (either passed in or taken from
        # the datasource).  Sort so that output order is deterministic.
        self.keys = sorted(keys)
        name = self._format_name(name, [dict_datasource, self.keys[:10]])
        super().__init__(name, self.process, depends_on=[dict_datasource],
                         returns=returns)

    def process(self, d):
        """Vectorize `d`, substituting `self.returns()` for missing keys."""
        return [(d[key] if key in d else self.returns())
                for key in self.keys]
def test_dictionary():
    """Check Hebrew dictionary word extraction and feature picklability."""
    cache = {r_text: "סוויפט גדלה בוויומיסינג, לנאשוויל"}
    datasources = hebrew.dictionary.revision.datasources

    in_dict = solve(datasources.dict_words, cache=cache)
    assert in_dict == ['גדלה']

    out_of_dict = solve(datasources.non_dict_words, cache=cache)
    assert out_of_dict == ['סוויפט', 'בוויומיסינג', 'לנאשוויל']

    assert hebrew.dictionary == pickle.loads(pickle.dumps(hebrew.dictionary))
"""
``revscoring intersect_merge_observations -h``
::

    Intersect observation data.  Fields will be merged.  Data is triaged
    according to the order of filenames in the commandline arguments,
    with the later files taking preference over earlier.

    Usage:
        intersect_merge_observations -h | --help
        intersect_merge_observations <input>...
                                     [--output=<path>]
                                     [--id-column=<str>]

    Options:
        <input>            List of input file paths
        --output=<path>    Path to write out the merged observations
                           [default: <stdout>]
        --id-column=<str>  Name of the id column for deduplication.
                           [default: rev_id]
"""
import sys

import deep_merge
import docopt

from .util import dump_observation, read_observations


def main(argv=None):
    """Parse commandline parameters, read files and write merged data."""
    args = docopt.docopt(__doc__, argv=argv)

    if args['--output'] == "<stdout>":
        out_file = sys.stdout
    else:
        out_file = open(args['--output'], "w")

    # Input handles are left to close at process exit (CLI lifetime).
    observation_sets = (read_observations(open(path, "r"))
                        for path in args['<input>'])

    intersected_observations = intersect_merge_observations(
        observation_sets, id_column=args['--id-column'])

    for ob in intersected_observations:
        dump_observation(ob, out_file)


def intersect_merge_observations(observation_sets, id_column):
    """
    Intersect all observations on `id_column`, deep-merging the fields of
    observations that share an id.  Yields the merged observations.

    :Parameters:
        observation_sets : `iterable` ( `iterable` ( `dict` ) )
            One iterable of observations per input file
        id_column : `str`
            The key used to match observations across sets
    """
    observation_maps = [
        {ob[id_column]: ob for ob in observation_set}
        for observation_set in observation_sets]

    for id_ in observation_maps[0]:
        # Only keep ids that appear in every input set.
        if all(id_ in om for om in observation_maps[1:]):
            new_ob = {}
            for observation_map in observation_maps:
                new_ob = deep_merge.merge(new_ob, observation_map[id_])

            yield new_ob
import pickle
import re

from revscoring.datasources import Datasource
from revscoring.dependencies import solve
from revscoring.features.meta import bools

my_item = Datasource("my_item")

my_set = Datasource("my_set")

my_string = Datasource("my_string")


def test_regex_match():
    """regex_match with a raw pattern vs. a pre-compiled pattern."""
    starts_with_t = bools.regex_match(r"^t", my_string)

    # Raw string patterns match regardless of case (see "Too" below).
    for text, expected in [("Foo", False), ("too", True), ("Too", True)]:
        assert solve(starts_with_t, cache={my_string: text}) is expected

    assert pickle.loads(pickle.dumps(starts_with_t)) == starts_with_t

    starts_with_lower_t = bools.regex_match(re.compile(r"^t"), my_string)

    # A pre-compiled pattern is used as-is, so matching is case-sensitive.
    for text, expected in [("Foo", False), ("too", True), ("Too", False)]:
        assert solve(starts_with_lower_t, cache={my_string: text}) is expected

    assert pickle.loads(pickle.dumps(starts_with_lower_t)
                        ) == starts_with_lower_t
"""
``revscoring union_merge_observations -h``
::

    Merge labeled revisions, taking the union of values for any rows with
    the same id. Data is triaged according to the order of filenames in the
    commandline arguments, with the later files taking preference over
    earlier. Behavior is not specified if an input file has duplicate
    revisions within it.

    FIXME: Reading everything into memory is reckless. Estimate where this
    hits a wall.

    Usage:
        union_merge_observations -h | --help
        union_merge_observations <input>...
                                 [--output=<path>]
                                 [--id-column=<str>]

    Options:
        <input>            List of input file paths
        --output=<path>    Path to write out the merged observations
                           [default: <stdout>]
        --id-column=<str>  Name of the id field for deduplication.
                           [default: rev_id]
"""

import collections
import itertools
import sys

import deep_merge
import docopt

from .util import dump_observation, read_observations


def main(argv=None):
    """Parse commandline parameters, read files and write merged data."""
    args = docopt.docopt(__doc__, argv=argv)

    if args['--output'] == "<stdout>":
        out_file = sys.stdout
    else:
        out_file = open(args['--output'], "w")

    observation_chunks = (read_observations(open(path, "r"))
                          for path in args['<input>'])
    all_observations = itertools.chain(*observation_chunks)

    merged_observations = union_merge_observations(
        all_observations, id_column=args['--id-column'])
    for ob in merged_observations:
        dump_observation(ob, out_file)


def union_merge_observations(observations, id_column):
    """
    Merge all observations sharing an `id_column` value, with later
    observations taking precedence when keys match.  Returns an iterable
    (dict view) of the merged observations.
    """
    id_map = collections.defaultdict(dict)
    for ob in observations:
        # Get the id value.
        ob_id = ob[id_column]

        # Merge the contents, with later entries taking precedence when keys
        # match.
        id_map[ob_id] = deep_merge.merge(id_map[ob_id], ob)

    return id_map.values()
"""
This module implements a set of :class:`revscoring.Feature`
for use in scoring revisions. :class:`revscoring.Feature`
lists can be provided to a :func:`revscoring.dependencies.solve`, or
more commonly, to a :class:`revscoring.Extractor` to obtain simple
numerical/boolean values that can be used when modeling revision
scores. The provided features are split conceptually into a set of modules:

Feature collections
+++++++++++++++++++

:mod:`~revscoring.features.revision_oriented`
    Basic features of revisions. E.g. ``revision.user.text_matches(r'.*Bot')``
:mod:`~revscoring.features.bytes`
    Features of the number of bytes of content, byte length of characters,
    etc.
:mod:`~revscoring.features.temporal`
    Features of the time between events of interest. E.g.
    ``revision.user.last_revision.seconds_since``
:mod:`~revscoring.features.wikibase`
    Features of wikibase items and changes made to them. E.g.
    ``revision.diff.property_changed('P31')``
:mod:`~revscoring.features.wikitext`
    Features of wikitext content and differences between revisions. E.g.
    ``revision.diff.uppercase_words_added``

Functions
+++++++++

.. automodule:: revscoring.features.functions

Meta-features
+++++++++++++
Meta-Features are classes that extend :class:`~revscoring.Feature` and
implement common operations on :class:`~revscoring.Datasource` like
:class:`~revscoring.features.meta.aggregators.sum` and
:class:`~revscoring.features.meta.bools.item_in_set`. See
:mod:`revscoring.features.meta` for the full list.

Modifiers
+++++++++
Modifiers are functions that can be applied to a :class:`revscoring.Feature`
to modify the value. E.g. :class:`~revscoring.features.modifiers.log`,
:class:`~revscoring.features.modifiers.max` and
:class:`~revscoring.features.modifiers.add`.
See :mod:`~revscoring.features.modifiers` for the full list.

Base classes
++++++++++++

.. automodule:: revscoring.features.feature

.. automodule:: revscoring.features.feature_vector
"""

from .feature import Constant, Feature, Modifier
from .feature_vector import FeatureVector
from .functions import trim, vectorize_values

# `__all__` must contain *names* (strings), not the objects themselves;
# listing objects breaks `from revscoring.features import *` with a
# TypeError at import time.
__all__ = ["Feature", "Modifier", "Constant", "FeatureVector", "trim",
           "vectorize_values"]
def main():
    """Dispatch ``revscoring <utility>`` to the matching utilities module.

    Prints usage/help to stderr and exits non-zero when no utility is
    named, when help is requested, or when the utility cannot be imported.
    """
    if len(sys.argv) < 2:
        sys.stderr.write(USAGE)
        sys.exit(1)

    utility_name = sys.argv[1]
    if utility_name in ("-h", "--help"):
        # The module docstring doubles as the full help text.
        sys.stderr.write(__doc__ + "\n")
        sys.exit(1)
    if utility_name.startswith("-"):
        sys.stderr.write(USAGE)
        sys.exit(1)

    try:
        module = import_module(".utilities." + utility_name,
                               package="revscoring")
    except ImportError:
        sys.stderr.write(traceback.format_exc())
        sys.stderr.write(
            "Could not find utility {0}.\n".format(utility_name))
        sys.exit(1)

    module.main(sys.argv[2:])


if __name__ == "__main__":
    main()
37 | (fh, out_file) = tempfile.mkstemp() 38 | 39 | # Do the union. 40 | argv = in_files + ["--output", out_file] 41 | main(argv) 42 | 43 | # Get results. 44 | with open(out_file, "r") as f: 45 | out_text = f.read() 46 | 47 | # Clean up test file :( It would be better if a context manager could 48 | # ensure we don't abort before cleaning up. 49 | os.unlink(out_file) 50 | 51 | # Split result into lines. 52 | lines = out_text.strip().split("\n") 53 | 54 | # There should be six records. 55 | assert len(lines) == 6 56 | 57 | # If this counter is set to 1, it tells us that our row of interest was 58 | # present and not duplicated. 59 | count_merged = 0 60 | 61 | # Spot check a couple o' lines. 62 | for line in lines: 63 | obj = json.loads(line) 64 | if obj["rev_id"] == "16124458": 65 | assert obj == {"damaging": 0, "goodfaith": 1, "approved": 1, 66 | "rev_id": "16124458"} 67 | if obj["rev_id"] == "16124390": 68 | assert obj == {"damaging": 0, "approved": 1, "goodfaith": 0, 69 | "foo": 1, "rev_id": "16124390"} 70 | count_merged += 1 71 | 72 | assert count_merged == 1 73 | -------------------------------------------------------------------------------- /revscoring/languages/finnish.py: -------------------------------------------------------------------------------- 1 | from .features import RegexMatches, Stopwords 2 | 3 | name = "finnish" 4 | 5 | # No dictionary 6 | 7 | # No stemmer 8 | 9 | try: 10 | from nltk.corpus import stopwords as nltk_stopwords 11 | stopwords = set(nltk_stopwords.words('finnish')) 12 | except LookupError: 13 | raise ImportError("Could not load stopwords for {0}. ".format(__name__) + 14 | "You may need to install the nltk 'stopwords' " + 15 | "corpora. 
# Wrap the raw NLTK stopword set in a Stopwords feature.  NOTE: this rebinds
# the module-level `stopwords` name from a `set` to the feature collection.
stopwords = Stopwords(name + ".stopwords", stopwords)
"""
:class:`~revscoring.languages.features.Stopwords` features provided by
`nltk.corpus.stopwords <http://www.nltk.org/>`_ "finnish"
"""

# Raw regular expressions used to build the `badwords` feature below.
badword_regexes = [
    r"homo",
    r"homoja",
    r"homot",
    r"hintti",
    r"homppeli",
    r"huora",
    r"idiootti",
    r"jumalauta",
    r"juntti",
    r"kakka",
    r"kakkaa",
    r"kikkeli",
    r"kyrpä",
    r"kulli",
    r"kusi",
    r"kusipää",
    r"läski",
    r"mamu",
    r"matu",
    r"neekeri",
    r"nussii",
    r"narttu",
    r"paska",
    r"paskaa",
    r"paskat",
    r"paskin",
    r"paskova",
    r"pelle",
    r"perse",
    r"perseeseen",
    r"perseessä",
    r"perseestä",
    r"perseenreikä",
    r"perkele",
    r"pillu",
    r"pilluun",
    r"pippeli",
    r"pieru",
    r"retardi",
    r"runkkari",
    r"saatana",
    r"saatanan",
    r"tyhmä",
    r"vammane",
    r"vammanen",
    r"vittu",
    r"vitun",
    r"äpärä"
]

badwords = RegexMatches(name + ".badwords", badword_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
badword detecting regexes.
"""

# Raw regular expressions used to build the `informals` feature below.
informal_regexes = [
    r"haistakaa",
    r"imekää",
    r"lol",
    r"ootte",
    r"moi",
    r"hei",
    r"sinä",
    r"sä",
    r"minä",
    r"mää",
    r"ok",
    r"joo",
    r"okei"
]

informals = RegexMatches(name + ".informals", informal_regexes)
"""
:class:`~revscoring.languages.features.RegexMatches` features via a list of
informal word detecting regexes.
"""
100 | """ 101 | -------------------------------------------------------------------------------- /tests/languages/test_japanese.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from revscoring.languages import japanese 4 | 5 | from .util import compare_extraction 6 | 7 | BAD = [ 8 | "死ね", 9 | "しね", 10 | "シネ", 11 | "あほ", 12 | "アホ", 13 | "ばか", 14 | "バカ", 15 | "やりまん", 16 | "ヤリマン", 17 | "まんこ", 18 | "マンコ", 19 | "うんこ", 20 | "ウンコ", 21 | "きもい", 22 | "キモイ", 23 | "痴女", 24 | "淫乱", 25 | "在日", 26 | "チョン", 27 | "支那", 28 | "うざい", 29 | "うぜー", 30 | "wwww", 31 | "wwww", 32 | "wwwwwwww", 33 | "wwwwwwwwwwwwwww" 34 | ] 35 | 36 | INFORMAL = [ 37 | # Words 38 | "(笑)", 39 | "(笑)", 40 | "・・・", 41 | "お願いします", 42 | "こんにちは", 43 | "はじめまして", 44 | "ありがとうございます", 45 | "ありがとうございました", 46 | "すみません", 47 | "思います", 48 | "はい", 49 | "いいえ", 50 | "ですが", 51 | "あなた", 52 | "おっしゃる", 53 | 54 | # sub-word patterns 55 | "ね。", 56 | "な。", 57 | "よ。", 58 | "わ。", 59 | "が。", 60 | "は。", 61 | "に。", 62 | "か?", 63 | "んか。", 64 | "すか。", 65 | "ます。", 66 | "せん。", 67 | "です。", 68 | "ました。", 69 | "でした。", 70 | "しょう。", 71 | "しょうか。", 72 | "ください。", 73 | "下さい。", 74 | "ますが", 75 | "ですが", 76 | "ましたが", 77 | "でしたが", 78 | "さん、", 79 | "様、", 80 | "ちゃい", 81 | "ちゃう", 82 | "ちゃえ", 83 | "ちゃっ", 84 | "っちゃ", 85 | "じゃない", 86 | "じゃなく" 87 | ] 88 | 89 | OTHER = [ 90 | """ 91 | 本項で解説する地方病とは、山梨県における日本住血吸虫症の呼称であり、 92 | 長い間その原因が明らかにならず住民を苦しめた感染症である。ここでは、 93 | その克服・撲滅に至る歴史について説明する。 94 | この疾患は住血吸虫類に分類される寄生虫である日本住血吸虫の寄生によって発症する寄生虫病であり、 95 | ヒトを含む哺乳類全般の血管内部に寄生感染する人獣共通感染症でもある。 96 | 97 | 病名および原虫に日本の国名が冠されているのは、 98 | 疾患の原因となる病原体(日本住血吸虫)の生体が、 99 | 世界で最初に日本国内(現:山梨県甲府市)で発見されたことによるものであって、 100 | 日本固有の疾患というわけではない。日本住血吸虫症は、中国、フィリピン、 101 | インドネシアの3カ国を中心に、 102 | 年間数千人から数万人規模の新規感染患者が発生しており、 103 | 世界保健機関 (WHO)などによって、さまざまな対策が行われている。 104 | """ 105 | ] 106 | 107 | 108 | def test_badwords(): 109 | compare_extraction(japanese.badwords.revision.datasources.matches, 110 | BAD, OTHER) 111 | 112 | assert japanese.badwords == 
import logging
from collections import OrderedDict

from tabulate import tabulate

from ... import util
from ...model_info import ModelInfo

logger = logging.getLogger(__name__)

# Column-major tables wider than this (in characters) are rendered
# row-major instead.
MAX_COLUMNS_WIDTH_CHARS = 80


class Rates(ModelInfo):
    """
    Label rates observed in a sample and, optionally, the expected rates
    in the population.
    """

    def __init__(self, counts, population_rates=None):
        """
        :Parameters:
            counts : `dict`
                Label counts containing an ``'n'`` total and a
                ``'labels'`` map of label --> count.
            population_rates : `dict`
                Optional map of label --> expected rate in the population.
        """
        super().__init__()
        self['sample'] = OrderedDict(
            (label, lcount / counts['n'])
            for label, lcount in counts['labels'].items())
        if population_rates:
            self['population'] = OrderedDict(
                (label, population_rates[label]) for label in counts['labels'])

    def format_str(self, path_tree, ndigits=3, **kwargs):
        """Render the rates as an indented, human-readable table."""
        if len(path_tree) > 0:
            # `Logger.warn` is a deprecated alias; use `warning`.
            logger.warning("Ignoring path_tree={0!r} while formatting rates."
                           .format(path_tree))

        formatted = "rates:\n"
        table_str = self.format_table(ndigits)
        formatted += util.tab_it_in(table_str)
        return formatted

    def format_json(self, path_tree, ndigits=3, **kwargs):
        """Render the rates as a JSON-compatible dict of rounded values."""
        doc = OrderedDict()
        for key in path_tree or self.keys():
            sub_tree = path_tree.get(key, {})
            if len(sub_tree) > 0:
                logger.warning(
                    "Ignoring path_tree={0!r} while formatting rates."
                    .format(sub_tree))
            group = self[key]
            doc[key] = {label: util.round(group[label], ndigits)
                        for label in group}
        return doc

    def format_table(self, ndigits):
        """Pick column- or row-major layout based on total header width."""
        column_header_width = sum(max(len(str(label)) + 2, ndigits + 4)
                                  for label in self['sample'])
        if column_header_width < MAX_COLUMNS_WIDTH_CHARS:
            return self.format_column_major_table(ndigits)
        else:
            return self.format_row_major_table(ndigits)

    def format_column_major_table(self, ndigits):
        """One row per group ('sample'/'population'), one column per label."""
        return tabulate(
            [[group] + [util.round(self[group].get(label), ndigits)
                        for label in self['sample']]
             for group in self],
            headers=[''] + [repr(label) for label in self['sample']])

    def format_row_major_table(self, ndigits):
        """One row per label, one column per group ('sample'/'population')."""
        return tabulate(
            [([label] +
              [util.round(self[group][label], ndigits) for group in self])
             for label in self['sample']],
            headers=[''] + list(self.keys()))
16 | """ 17 | 18 | # https://vi.wiktionary.org/wiki/Th%C3%A0nh_vi%C3%AAn:Laurent_Bouvier/ 19 | # Free_Vietnamese_Dictionary_Project_Vietnamese-Vietnamese#Allwiki_.28closed.29 20 | stopwords = set([ 21 | "ai", "bằng", "bị", "bộ", "cho", "chưa", "chỉ", "cuối", "cuộc", 22 | "các", "cách", "cái", "có", "cùng", "cũng", "cạnh", "cả", "cục", 23 | "của", "dùng", "dưới", "dừng", "giữa", "gì", "hay", "hoặc", 24 | "khi", "khác", "không", "luôn", "là", "làm", "lại", "mà", "mọi", 25 | "mỗi", "một", "nhiều", "như", "nhưng", "nào", "này", "nữa", 26 | "phải", "qua", "quanh", "quá", "ra", "rất", "sau", "sẽ", "sự", 27 | "theo", "thành", "thêm", "thì", "thứ", "trong", "trên", "trước", 28 | "trừ", "tuy", "tìm", "từng", "và", "vài", "vào", "vì", "vẫn", 29 | "về", "với", "xuống", "đang", "đã", "được", "đấy", "đầu", "đủ" 30 | ]) 31 | 32 | stopwords = Stopwords(name + ".stopwords", stopwords) 33 | """ 34 | :class:`~revscoring.languages.features.Stopwords` features copied from 35 | https://vi.wiktionary.org/wiki/Th%C3%A0nh_vi%C3%AAn:Laurent_Bouvier/Free_Vietnamese_Dictionary_Project_Vietnamese-Vietnamese#Allwiki_.28closed.29 36 | """ # noqa 37 | 38 | badword_regexes = [ 39 | # Vietnamese 40 | r"[ck]ặ[tc]", r"[ck]u", r"cứt", r"(dz?|gi)âm", r"đái", r"đéo", r"đ[ụù]", 41 | r"đĩ", r"đ[íị]t", r"ỉa", r"l[ôồ]n", r"trứng" 42 | ] 43 | 44 | badwords = RegexMatches(name + ".badwords", badword_regexes) 45 | """ 46 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 47 | badword detecting regexes. 
48 | """ 49 | 50 | informal_regexes = [ 51 | # Vietnamese 52 | r"bợn", r"bro", 53 | r"chẳng", r"ch[ớứ]", r"cú", 54 | r"đụ", r"đừng", r"fải", 55 | r"khỉ", 56 | r"mày", r"nghịch", r"ngu", r"ngụy", r"nguỵ", 57 | r"ok", r"ơi", 58 | r"quái", 59 | r"thằng", r"thôi", r"tui", r"ừ", r"vời", r"wái?", 60 | r"zì" 61 | ] 62 | 63 | informals = RegexMatches(name + ".informals", informal_regexes) 64 | """ 65 | :class:`~revscoring.languages.features.RegexMatches` features via a list of 66 | informal word detecting regexes. 67 | """ 68 | -------------------------------------------------------------------------------- /revscoring/utilities/test_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``revscoring test_model -h`` 3 | :: 4 | 5 | Tests a scorer model. This utility expects to get a file of 6 | tab-separated feature values and labels from which to test a model. 7 | 8 | Usage: 9 | test_model -h | --help 10 | test_model