├── .gitattributes ├── .github ├── CODEOWNERS ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── performance_issue.md ├── actions │ └── install-env │ │ └── action.yml ├── pull_request_template.md └── workflows │ ├── code-quality.yml │ ├── delete-caches.yml │ ├── dev-docs.yml │ ├── pypi.yml │ ├── release-docs.yml │ └── unit-tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.bib ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── benchmarks ├── Batch versus online.ipynb ├── Factorization machines.ipynb ├── README.md ├── binary_classification.csv ├── config.py ├── details.json ├── model_adapters │ ├── __init__.py │ └── vw.py ├── multiclass_classification.csv ├── regression.csv ├── render.py └── run.py ├── build.py ├── docs ├── .pages ├── CNAME ├── benchmarks │ ├── .pages │ ├── Binary classification │ │ ├── binary_classification.csv │ │ └── index.md │ ├── Multiclass classification │ │ ├── index.md │ │ └── multiclass_classification.csv │ └── Regression │ │ ├── index.md │ │ └── regression.csv ├── css │ └── version-select.css ├── examples │ ├── .pages │ ├── batch-to-online.ipynb │ ├── bike-sharing-forecasting.ipynb │ ├── building-a-simple-nowcasting-model.ipynb │ ├── content-personalization.ipynb │ ├── debugging-a-pipeline.ipynb │ ├── imbalanced-learning.ipynb │ ├── matrix-factorization-for-recommender-systems │ │ ├── .pages │ │ ├── part-1.ipynb │ │ ├── part-2.ipynb │ │ └── part-3.ipynb │ ├── quantile-regression-uncertainty.ipynb │ ├── sentence-classification.ipynb │ ├── sentence_classification_files │ │ ├── sentence_classification_14_0.svg │ │ ├── sentence_classification_19_0.svg │ │ └── sentence_classification_31_0.svg │ └── the-art-of-using-pipelines.ipynb ├── faq │ ├── .pages │ └── index.md ├── img │ ├── dtree_draw.svg │ ├── favicon.ico │ ├── favicon_old.ico │ ├── histogram_docstring.svg │ ├── icon.png │ ├── illustration.png │ ├── illustration_old.png │ ├── logo.svg │ ├── logo_white.svg │ ├── online_active_learning.png │ ├── pipeline_docstring.svg │ └── skyline_docstring.svg ├── index.md ├── introduction │ ├── .pages │ ├── basic-concepts.md │ ├── getting-started │ │ ├── binary-classification.ipynb │ │ ├── concept-drift-detection.ipynb │ │ ├── concept-drift-detection_files │ │ │ ├── concept-drift-detection_1_0.png │ │ │ └── concept-drift-detection_3_1.png │ │ ├── multiclass-classification.ipynb │ │ └── regression.ipynb │ ├── installation.md │ ├── next-steps.md │ ├── related-projects.md │ └── why-use-river.md ├── javascripts │ ├── config.js │ └── tablesort.js ├── js │ └── version-select.js ├── license │ ├── .pages │ └── license.md ├── overrides │ ├── home.html │ └── partials │ │ ├── footer.html │ │ └── integrations │ │ └── analytics.html ├── parse │ └── __main__.py ├── recipes │ ├── .pages │ ├── active-learning.ipynb │ ├── bandits-101.ipynb │ ├── cloning-and-mutating.ipynb │ ├── feature-extraction.ipynb │ ├── hyperparameter-tuning.ipynb │ ├── mini-batching.ipynb │ ├── model-evaluation.ipynb │ ├── on-hoeffding-trees.ipynb │ ├── on-hoeffding-trees_files │ │ ├── on-hoeffding-trees_12_0.svg │ │ ├── on-hoeffding-trees_21_0.png │ │ ├── on-hoeffding-trees_23_0.png │ │ ├── on-hoeffding-trees_25_0.png │ │ ├── on-hoeffding-trees_27_0.png │ │ └── on-hoeffding-trees_29_0.png │ ├── pipelines.ipynb │ ├── pipelines_files │ │ ├── pipelines_18_0.svg │ │ └── pipelines_8_0.svg │ ├── reading-data.ipynb │ └── rolling-computations.ipynb ├── releases │ ├── .pages │ ├── 0.0.2.md │ ├── 0.0.3.md │ ├── 0.1.0.md │ ├── 0.10.0.md │ ├── 0.10.1.md │ ├── 
0.11.0.md │ ├── 0.11.1.md │ ├── 0.12.0.md │ ├── 0.12.1.md │ ├── 0.13.0.md │ ├── 0.14.0.md │ ├── 0.15.0.md │ ├── 0.16.0.md │ ├── 0.17.0.md │ ├── 0.18.0.md │ ├── 0.19.0.md │ ├── 0.2.0.md │ ├── 0.20.0.md │ ├── 0.20.1.md │ ├── 0.21.0.md │ ├── 0.21.1.md │ ├── 0.21.2.md │ ├── 0.22.0.md │ ├── 0.3.0.md │ ├── 0.4.1.md │ ├── 0.4.3.md │ ├── 0.4.4.md │ ├── 0.5.0.md │ ├── 0.5.1.md │ ├── 0.6.0.md │ ├── 0.6.1.md │ ├── 0.7.0.md │ ├── 0.7.1.md │ ├── 0.8.0.md │ ├── 0.9.0.md │ └── unreleased.md └── stylesheets │ └── extra.css ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── river ├── __init__.py ├── __version__.py ├── active │ ├── __init__.py │ ├── base.py │ └── entropy.py ├── anomaly │ ├── __init__.py │ ├── base.py │ ├── filter.py │ ├── gaussian.py │ ├── hst.py │ ├── lof.py │ ├── pad.py │ ├── sad.py │ ├── svm.py │ ├── test_hst.py │ ├── test_lof.py │ └── test_svm.py ├── api.py ├── bandit │ ├── __init__.py │ ├── base.py │ ├── bayes_ucb.py │ ├── datasets │ │ ├── __init__.py │ │ ├── base.py │ │ └── news.py │ ├── envs │ │ ├── __init__.py │ │ ├── candy_cane.py │ │ └── testbed.py │ ├── epsilon_greedy.py │ ├── evaluate.py │ ├── exp3.py │ ├── lin_ucb.py │ ├── random.py │ ├── test_envs.py │ ├── test_policies.py │ ├── thompson.py │ └── ucb.py ├── base │ ├── __init__.py │ ├── base.py │ ├── classifier.py │ ├── clusterer.py │ ├── drift_detector.py │ ├── ensemble.py │ ├── estimator.py │ ├── multi_output.py │ ├── regressor.py │ ├── tags.py │ ├── test_base.py │ ├── transformer.py │ ├── typing.py │ ├── viz.py │ └── wrapper.py ├── checks │ ├── __init__.py │ ├── anomaly.py │ ├── clf.py │ ├── common.py │ ├── model_selection.py │ ├── reco.py │ └── utils.py ├── cluster │ ├── __init__.py │ ├── clustream.py │ ├── dbstream.py │ ├── denstream.py │ ├── k_means.py │ ├── odac.py │ ├── streamkmeans.py │ ├── test_dbstream.py │ └── textclust.py ├── compat │ ├── __init__.py │ ├── river_to_sklearn.py │ ├── sklearn_to_river.py │ └── test_sklearn.py ├── compose │ ├── __init__.py │ ├── func.py │ ├── grouper.py │ ├── pipeline.py │ ├── product.py │ ├── renamer.py │ ├── select.py │ ├── target_transform.py │ ├── test_.py │ ├── test_product.py │ └── union.py ├── conf │ ├── __init__.py │ ├── interval.py │ └── jackknife.py ├── conftest.py ├── covariance │ ├── __init__.py │ ├── emp.py │ └── test_emp.py ├── datasets │ ├── __init__.py │ ├── airline-passengers.csv │ ├── airline_passengers.py │ ├── banana.zip │ ├── bananas.py │ ├── base.py │ ├── bikes.py │ ├── chick-weights.csv │ ├── chick_weights.py │ ├── credit_card.py │ ├── elec2.py │ ├── higgs.py │ ├── http.py │ ├── insects.py │ ├── keystroke.py │ ├── malicious_url.py │ ├── movielens100k.py │ ├── music.py │ ├── phishing.csv.gz │ ├── phishing.py │ ├── restaurants.py │ ├── segment.csv.zip │ ├── segment.py │ ├── sms_spam.py │ ├── smtp.py │ ├── solar-flare.csv.zip │ ├── solar_flare.py │ ├── synth │ │ ├── __init__.py │ │ ├── agrawal.py │ │ ├── anomaly_sine.py │ │ ├── concept_drift_stream.py │ │ ├── friedman.py │ │ ├── hyper_plane.py │ │ ├── led.py │ │ ├── logical.py │ │ ├── mixed.py │ │ ├── mv.py │ │ ├── planes_2d.py │ │ ├── random_rbf.py │ │ ├── random_tree.py │ │ ├── sea.py │ │ ├── sine.py │ │ ├── stagger.py │ │ └── waveform.py │ ├── taxis.py │ ├── test_datasets.py │ ├── trec07.py │ ├── trump_approval.csv.gz │ ├── trump_approval.py │ ├── water-flow.csv │ ├── water_flow.py │ └── web_traffic.py ├── drift │ ├── __init__.py │ ├── adwin.py │ ├── adwin_c.pyi │ ├── adwin_c.pyx │ ├── binary │ │ ├── __init__.py │ │ ├── ddm.py │ │ ├── eddm.py │ │ ├── fhddm.py │ │ ├── hddm_a.py │ │ └── hddm_w.py │ ├── datasets 
│ │ ├── __init__.py │ │ ├── airline_passengers.csv │ │ ├── airline_passengers.py │ │ ├── apple.csv │ │ ├── apple.py │ │ ├── base.py │ │ ├── bitcoin.csv │ │ ├── bitcoin.py │ │ ├── brent_crude_oil.csv │ │ ├── brent_crude_oil.py │ │ ├── occupancy.csv │ │ ├── occupancy.py │ │ ├── run_log.csv │ │ ├── run_log.py │ │ ├── uk_coal_employment.csv │ │ └── uk_coal_employment.py │ ├── dummy.py │ ├── kswin.py │ ├── no_drift.py │ ├── page_hinkley.py │ ├── retrain.py │ └── test_drift_detectors.py ├── dummy.py ├── ensemble │ ├── __init__.py │ ├── bagging.py │ ├── boosting.py │ ├── ewa.py │ ├── stacking.py │ ├── streaming_random_patches.py │ └── voting.py ├── evaluate │ ├── __init__.py │ ├── gen.py │ ├── progressive_validation.py │ └── tracks.py ├── facto │ ├── __init__.py │ ├── base.py │ ├── ffm.py │ ├── fm.py │ ├── fwfm.py │ └── hofm.py ├── feature_extraction │ ├── __init__.py │ ├── agg.py │ ├── kernel_approx.py │ ├── poly.py │ ├── test_agg.py │ ├── test_vectorize.py │ └── vectorize.py ├── feature_selection │ ├── __init__.py │ ├── k_best.py │ ├── random.py │ └── variance.py ├── forest │ ├── __init__.py │ ├── adaptive_random_forest.py │ ├── aggregated_mondrian_forest.py │ ├── online_extra_trees.py │ └── test_amf.py ├── imblearn │ ├── __init__.py │ ├── chebyshev.py │ ├── hard_sampling.py │ └── random.py ├── linear_model │ ├── __init__.py │ ├── alma.py │ ├── base.py │ ├── bayesian_lin_reg.py │ ├── lin_reg.py │ ├── log_reg.py │ ├── pa.py │ ├── perceptron.py │ ├── softmax.py │ └── test_glm.py ├── metrics │ ├── __init__.py │ ├── accuracy.py │ ├── balanced_accuracy.py │ ├── base.py │ ├── confusion.py │ ├── cross_entropy.py │ ├── efficient_rollingrocauc │ │ ├── __init__.py │ │ ├── cpp │ │ │ ├── RollingROCAUC.cpp │ │ │ └── RollingROCAUC.hpp │ │ ├── efficient_rollingrocauc.pxd │ │ ├── efficient_rollingrocauc.pyi │ │ └── efficient_rollingrocauc.pyx │ ├── expected_mutual_info.pyi │ ├── expected_mutual_info.pyx │ ├── fbeta.py │ ├── fowlkes_mallows.py │ ├── geometric_mean.py │ ├── jaccard.py │ ├── kappa.py │ ├── log_loss.py │ ├── mae.py │ ├── mape.py │ ├── mcc.py │ ├── mse.py │ ├── multioutput │ │ ├── __init__.py │ │ ├── base.py │ │ ├── confusion.py │ │ ├── exact_match.py │ │ ├── macro.py │ │ ├── micro.py │ │ ├── per_output.py │ │ ├── sample_average.py │ │ └── test_multioutput_metrics.py │ ├── mutual_info.py │ ├── precision.py │ ├── r2.py │ ├── rand.py │ ├── recall.py │ ├── report.py │ ├── roc_auc.py │ ├── rolling_roc_auc.py │ ├── silhouette.py │ ├── smape.py │ ├── test_confusion.py │ ├── test_cross_entropy.py │ ├── test_fbeta.py │ ├── test_log_loss.py │ ├── test_metrics.py │ ├── test_r2.py │ └── vbeta.py ├── misc │ ├── __init__.py │ ├── sdft.py │ └── skyline.py ├── model_selection │ ├── __init__.py │ ├── bandit.py │ ├── base.py │ ├── greedy.py │ ├── sh.py │ └── test_bandit.py ├── multiclass │ ├── __init__.py │ ├── occ.py │ ├── ovo.py │ ├── ovr.py │ └── test_ovr.py ├── multioutput │ ├── __init__.py │ ├── chain.py │ └── encoder.py ├── naive_bayes │ ├── __init__.py │ ├── base.py │ ├── bernoulli.py │ ├── complement.py │ ├── gaussian.py │ ├── multinomial.py │ └── test_naive_bayes.py ├── neighbors │ ├── __init__.py │ ├── ann │ │ ├── __init__.py │ │ ├── nn_vertex.py │ │ └── swinn.py │ ├── base.py │ ├── knn_classifier.py │ ├── knn_regressor.py │ └── lazy.py ├── neural_net │ ├── __init__.py │ ├── activations.py │ └── mlp.py ├── optim │ ├── __init__.py │ ├── ada_bound.py │ ├── ada_delta.py │ ├── ada_grad.py │ ├── ada_max.py │ ├── adam.py │ ├── ams_grad.py │ ├── average.py │ ├── base.py │ ├── ftrl.py │ ├── initializers.py │ ├── 
losses.py │ ├── momentum.py │ ├── nadam.py │ ├── nesterov.py │ ├── newton.py │ ├── rms_prop.py │ ├── schedulers.py │ ├── sgd.py │ └── test_.py ├── preprocessing │ ├── __init__.py │ ├── feature_hasher.py │ ├── impute.py │ ├── lda.py │ ├── one_hot.py │ ├── ordinal.py │ ├── pred_clipper.py │ ├── random_projection.py │ ├── scale.py │ ├── scale_target.py │ ├── test_lda.py │ ├── test_random_projection.py │ └── test_scale.py ├── proba │ ├── __init__.py │ ├── base.py │ ├── beta.py │ ├── gaussian.py │ ├── multinomial.py │ └── test_gaussian.py ├── py.typed ├── reco │ ├── __init__.py │ ├── base.py │ ├── baseline.py │ ├── biased_mf.py │ ├── funk_mf.py │ └── normal.py ├── rules │ ├── __init__.py │ ├── amrules.py │ └── base.py ├── sketch │ ├── __init__.py │ ├── counter.py │ ├── heavy_hitters.py │ ├── histogram.py │ └── set.py ├── stats │ ├── __init__.py │ ├── _rust_stats.pyi │ ├── auto_corr.py │ ├── base.py │ ├── count.py │ ├── cov.py │ ├── entropy.py │ ├── ewmean.py │ ├── ewvar.py │ ├── iqr.py │ ├── kolmogorov_smirnov.py │ ├── kurtosis.py │ ├── link.py │ ├── mad.py │ ├── maximum.py │ ├── mean.py │ ├── minimum.py │ ├── mode.py │ ├── n_unique.py │ ├── pearson.py │ ├── ptp.py │ ├── quantile.py │ ├── sem.py │ ├── shift.py │ ├── skew.py │ ├── summing.py │ ├── test_kolmogorov_smirnov.py │ ├── test_parallel.py │ ├── test_quantile.py │ ├── test_stats.py │ ├── test_var.py │ └── var.py ├── stream │ ├── __init__.py │ ├── cache.py │ ├── iter_arff.py │ ├── iter_array.py │ ├── iter_csv.py │ ├── iter_libsvm.py │ ├── iter_pandas.py │ ├── iter_polars.py │ ├── iter_sklearn.py │ ├── iter_sql.py │ ├── iter_vaex.py │ ├── pokedb.zip │ ├── qa.py │ ├── shuffling.py │ ├── test_iter_csv.py │ ├── test_sql.py │ ├── tweet_stream.py │ ├── twitch_chat_stream.py │ └── utils.py ├── test_estimators.py ├── time_series │ ├── __init__.py │ ├── base.py │ ├── evaluate.py │ ├── holt_winters.py │ ├── metrics.py │ ├── snarimax.py │ ├── test_evaluate.py │ ├── test_holt_winters.py │ └── test_snarimax.py ├── tree │ ├── __init__.py │ ├── base.py │ ├── extremely_fast_decision_tree.py │ ├── hoeffding_adaptive_tree_classifier.py │ ├── hoeffding_adaptive_tree_regressor.py │ ├── hoeffding_tree.py │ ├── hoeffding_tree_classifier.py │ ├── hoeffding_tree_regressor.py │ ├── isoup_tree_regressor.py │ ├── last_classifier.py │ ├── losses.py │ ├── mondrian │ │ ├── __init__.py │ │ ├── mondrian_tree.py │ │ ├── mondrian_tree_classifier.py │ │ ├── mondrian_tree_nodes.py │ │ └── mondrian_tree_regressor.py │ ├── nodes │ │ ├── __init__.py │ │ ├── arf_htc_nodes.py │ │ ├── arf_htr_nodes.py │ │ ├── branch.py │ │ ├── efdtc_nodes.py │ │ ├── et_nodes.py │ │ ├── hatc_nodes.py │ │ ├── hatr_nodes.py │ │ ├── htc_nodes.py │ │ ├── htr_nodes.py │ │ ├── isouptr_nodes.py │ │ ├── last_nodes.py │ │ ├── leaf.py │ │ └── sgt_nodes.py │ ├── setup.py │ ├── split_criterion │ │ ├── __init__.py │ │ ├── base.py │ │ ├── gini_split_criterion.py │ │ ├── hellinger_distance_criterion.py │ │ ├── info_gain_split_criterion.py │ │ ├── intra_cluster_variance_reduction_split_criterion.py │ │ ├── variance_ratio_split_criterion.py │ │ └── variance_reduction_split_criterion.py │ ├── splitter │ │ ├── __init__.py │ │ ├── base.py │ │ ├── ebst_splitter.py │ │ ├── exhaustive_splitter.py │ │ ├── gaussian_splitter.py │ │ ├── histogram_splitter.py │ │ ├── nominal_splitter_classif.py │ │ ├── nominal_splitter_reg.py │ │ ├── qo_splitter.py │ │ ├── random_splitter.py │ │ ├── sgt_quantizer.py │ │ └── tebst_splitter.py │ ├── stochastic_gradient_tree.py │ ├── test_base.py │ ├── test_splitter.py │ ├── test_trees.py │ ├── 
utils.py │ └── viz.py └── utils │ ├── __init__.py │ ├── context_managers.py │ ├── inspect.py │ ├── math.py │ ├── norm.py │ ├── param_grid.py │ ├── pretty.py │ ├── random.py │ ├── rolling.py │ ├── sorted_window.py │ ├── test_math.py │ ├── test_param_grid.py │ ├── test_rolling.py │ ├── test_vectordict.py │ ├── vectordict.pyi │ └── vectordict.pyx └── rust_src └── lib.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=nbstripout 2 | *.ipynb diff=ipynb 3 | *.ipynb linguist-detectable=false 4 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @MaxHalford @smastelini 2 | river/facto @gbolmier 3 | river/stats @AdilZouitine 4 | river/cluster @hoanganhngo610 @Dennis1989 5 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: MaxHalford 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us reproduce and correct the bug 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 14 | ## Versions 15 | 18 | 19 | **river version**: 20 | **Python version**: 21 | **Operating system**: 22 | 23 | ## Describe the bug 24 | 27 | 28 | ## Steps/code to reproduce 29 | 43 | 44 | ```python 45 | # Sample code to reproduce the problem 46 | # Please do your best to provide a Minimal, Reproducible Example: https://stackoverflow.com/help/minimal-reproducible-example 47 | ``` 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/performance_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Performance issue 3 | about: Provide a reproducible example to debug a performance issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Versions 11 | 14 | 15 | **`creme` version**: 16 | 17 | **Python version**: 18 | 19 | ## Describe your task 20 | 23 | 24 | ## What kind of performance are you expecting? 
25 | 28 | 29 | ## Steps/code to reproduce 30 | 31 | 34 | 35 | ```python 36 | # Sample code to reproduce the performance issue 37 | ``` 38 | 39 | ## Necessary data 40 | 41 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /.github/actions/install-env/action.yml: -------------------------------------------------------------------------------- 1 | name: Install env 2 | 3 | inputs: 4 | python-version: 5 | description: "Python version to use" 6 | required: true 7 | build-root: 8 | default: "true" 9 | options: 10 | - true 11 | - false 12 | 13 | runs: 14 | using: "composite" 15 | steps: 16 | - name: Check out repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | id: set-up-python 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ inputs.python-version }} 24 | 25 | # Getting errors since using not the latest Python version in docs workflows 26 | # - name: Load cached Poetry installation 27 | # uses: actions/cache@v4 28 | # with: 29 | # path: ~/.local # the path depends on the OS 30 | # key: poetry-3 # modify to reset cache 31 | 32 | - name: Install poetry 33 | uses: snok/install-poetry@v1 34 | with: 35 | virtualenvs-create: true 36 | virtualenvs-in-project: true 37 | installer-parallel: true 38 | 39 | - name: Load cached virtual env 40 | id: cached-poetry-dependencies 41 | uses: actions/cache@v4 42 | with: 43 | path: .venv 44 | key: venv-${{ runner.os }}-${{ steps.set-up-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }} 45 | 46 | - name: Install dependencies 47 | shell: bash 48 | if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true' 49 | run: poetry install --no-interaction --no-ansi --no-root 50 | 51 | - name: Build 52 | shell: bash 53 | if: ${{ inputs.build-root == 'true' }} 54 | run: poetry install --no-interaction --no-ansi 55 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /.github/workflows/code-quality.yml: -------------------------------------------------------------------------------- 1 | name: code-quality 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | ubuntu: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Build River 18 | uses: ./.github/actions/install-env 19 | with: 20 | python-version: "3.13" 21 | build-root: false 22 | 23 | - name: MyPy type check 24 | run: poetry run mypy 25 | 26 | - name: Ruff code linting 27 | run: poetry run ruff check --output-format=github river/ 28 | 29 | - name: Ruff code formatting 30 | run: poetry run ruff format --check river/ 31 | -------------------------------------------------------------------------------- /.github/workflows/delete-caches.yml: -------------------------------------------------------------------------------- 1 | name: Clear all Github Actions caches 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: "0 0 * * 0" 6 | 7 | jobs: 8 | my-job: 9 | name: Delete all caches 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Clear caches 14 | uses: easimon/wipe-cache@main 15 | -------------------------------------------------------------------------------- /.github/workflows/dev-docs.yml: -------------------------------------------------------------------------------- 
1 | name: dev-docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Build River 16 | uses: ./.github/actions/install-env 17 | with: 18 | # Use 3.12 for the docs env waiting for spaCy and srsly to support 3.13 19 | python-version: "3.12" 20 | build-root: false 21 | 22 | - name: Install extra Ubuntu dependencies 23 | run: sudo apt-get install graphviz pandoc 24 | 25 | - name: Install extra Python dependencies 26 | run: | 27 | poetry install --with docs 28 | 29 | - name: Build docs 30 | run: | 31 | source $VENV 32 | make doc 33 | 34 | - name: Deploy docs 35 | env: 36 | GH_TOKEN: ${{ secrets.GitHubToken }} 37 | run: | 38 | source $VENV 39 | git config user.name github-actions 40 | git config user.email github-actions@github.com 41 | git config pull.rebase false 42 | git add --all 43 | git commit -m "Release dev docs" --allow-empty 44 | git fetch 45 | git checkout gh-pages 46 | git pull 47 | git checkout main 48 | mike deploy dev --push --remote https://github.com/${{ github.repository }}.git 49 | -------------------------------------------------------------------------------- /.github/workflows/unit-tests.yml: -------------------------------------------------------------------------------- 1 | name: unit-tests 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | push: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | run: 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest] 16 | python-version: ["3.13", "3.12", "3.11", "3.10"] 17 | 18 | runs-on: ${{ matrix.os }} 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Build River 24 | uses: ./.github/actions/install-env 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Cache River datasets 29 | uses: actions/cache@v4 30 | with: 31 | path: ~/river_data 32 | key: ${{ runner.os }} 33 | 34 | - name: Cache scikit-learn datasets 35 | uses: actions/cache@v4 36 | with: 37 | path: ~/scikit_learn_data 38 | key: ${{ runner.os }} 39 | 40 | - name: Download datasets 41 | run: | 42 | poetry run python -c "from river import datasets; datasets.CreditCard().download(); datasets.Elec2().download(); datasets.SMSSpam().download()" 43 | poetry run python -c "from river import bandit; bandit.datasets.NewsArticles().download()" 44 | 45 | - name: pytest 46 | run: | 47 | poetry run pytest -m "not datasets" --durations=10 -n logical # Run pytest on all logical CPU cores 48 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | files: river 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.4.0 5 | hooks: 6 | - id: check-json 7 | - id: check-yaml 8 | 9 | - repo: https://github.com/astral-sh/ruff-pre-commit 10 | # Ruff version, should be the same as in poetry.lock 11 | rev: v0.4.10 12 | hooks: 13 | # Run the linter. 14 | - id: ruff 15 | types_or: [python, pyi, jupyter] 16 | args: [--fix] 17 | # Run the formatter. 
18 | - id: ruff-format 19 | types_or: [python, pyi, jupyter] 20 | 21 | - repo: https://github.com/pre-commit/mirrors-mypy 22 | # MyPy version, should be the same as in poetry.lock 23 | rev: v1.13.0 24 | hooks: 25 | - id: mypy 26 | args: 27 | - "--config-file=pyproject.toml" 28 | - "--python-version=3.11" 29 | - "--implicit-optional" 30 | -------------------------------------------------------------------------------- /CITATION.bib: -------------------------------------------------------------------------------- 1 | @article{montiel2021river, 2 | title={River: machine learning for streaming data in Python}, 3 | author={Montiel, Jacob and Halford, Max and Mastelini, Saulo Martiello and Bolmier, Geoffrey and Sourty, Raphael and Vaysse, Robin and Zouitine, Adil and Gomes, Heitor Murilo and Read, Jesse and Abdessalem, Talel and others}, 4 | year={2021} 5 | } 6 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "river" 3 | version = "0.1.0" 4 | authors = ["Adil Zouitine "] 5 | edition = "2021" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | [lib] 9 | name = "river" 10 | path = "rust_src/lib.rs" 11 | crate-type = ["cdylib"] 12 | 13 | [dependencies] 14 | pyo3 = { version = "0.23.1", features = ["extension-module"] } 15 | watermill = "0.1.1" 16 | bincode = "1.3.3" 17 | serde = { version = "1.0", features = ["derive"] } 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2020, the river developers 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | COMMIT_HASH := $(shell eval git rev-parse HEAD) 2 | 3 | format: 4 | pre-commit run --all-files 5 | 6 | execute-notebooks: 7 | jupyter nbconvert --execute --to notebook --inplace docs/introduction/*/*.ipynb --ExecutePreprocessor.timeout=-1 8 | jupyter nbconvert --execute --to notebook --inplace docs/recipes/*.ipynb --ExecutePreprocessor.timeout=-1 9 | jupyter nbconvert --execute --to notebook --inplace docs/examples/*.ipynb --ExecutePreprocessor.timeout=-1 10 | jupyter nbconvert --execute --to notebook --inplace docs/examples/*/*.ipynb --ExecutePreprocessor.timeout=-1 11 | 12 | render-notebooks: 13 | jupyter nbconvert --to markdown docs/introduction/*/*.ipynb 14 | jupyter nbconvert --to markdown docs/recipes/*.ipynb 15 | jupyter nbconvert --to markdown docs/examples/*.ipynb 16 | jupyter nbconvert --to markdown docs/examples/*/*.ipynb 17 | 18 | doc: render-notebooks 19 | (cd benchmarks && python render.py) 20 | python docs/parse river --out docs --verbose 21 | mkdocs build 22 | 23 | livedoc: doc 24 | mkdocs serve --dirtyreload 25 | 26 | rebase: 27 | git fetch && git rebase origin/main 28 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | ## Installation 4 | 5 | The recommended way to run the benchmarks is to create a dedicated environment for river and its contenders. 6 | 7 | An easy way to achieve that is through [Anaconda](https://docs.conda.io/projects/miniconda/en/latest/). Here is an example of creating an environment for the benchmarks: 8 | 9 | ```sh 10 | conda create --name river-benchmark python=3.10 11 | ``` 12 | 13 | The next step is to clone river if you have not done that already: 14 | 15 | ```sh 16 | git clone https://github.com/online-ml/river 17 | cd river 18 | ``` 19 | 20 | From the river folder you can run the following command to install the needed dependencies: 21 | 22 | ```sh 23 | pip install ".[benchmarks]" 24 | ``` 25 | 26 | ## Usage 27 | 28 | The `run.py` script executes the benchmarks and creates the necessary .csv files for rendering the plots. 29 | 30 | ```sh 31 | cd benchmarks 32 | python run.py 33 | ``` 34 | 35 | The `render.py` script renders the plots from the .csv files and moves them to the `docs/benchmarks` folder. 36 | 37 | ```sh 38 | python render.py 39 | ``` 40 | 41 | ## Notes: Vowpal Wabbit 42 | 43 | Installing Vowpal Wabbit (VW) can sometimes be tricky, especially on Apple silicon. If you cannot get VW's own pip installation guidelines to work, the following workaround may help. When using Anaconda, you can install the dependencies recommended for building VW with conda. You can get more info [here](https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Building#conda) about these dependencies. After that, `pip install vowpalwabbit` should work just fine.
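Once the install succeeds, a quick way to check that the VW contender is usable is to run it through the `VW2RiverClassifier` adapter defined in `benchmarks/model_adapters/vw.py`. The following is a minimal sketch, assuming it is run from the `benchmarks` folder; the VW options are illustrative and not necessarily the ones used by the benchmark configuration in `config.py`.

```python
from model_adapters.vw import VW2RiverClassifier

from river import datasets, evaluate, metrics

# Illustrative VW options, passed straight through to vowpalwabbit's Workspace.
model = VW2RiverClassifier(loss_function="logistic", link="logistic", quiet=True)

# Progressively validate the adapter on a small built-in binary classification dataset.
print(evaluate.progressive_val_score(datasets.Phishing(), model, metrics.Accuracy()))
```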
44 | -------------------------------------------------------------------------------- /benchmarks/model_adapters/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/benchmarks/model_adapters/__init__.py -------------------------------------------------------------------------------- /benchmarks/model_adapters/vw.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from vowpalwabbit import pyvw 4 | 5 | from river import base 6 | 7 | 8 | class VW2RiverBase: 9 | def __init__(self, *args, **kwargs): 10 | self.vw = pyvw.Workspace(*args, **kwargs) 11 | 12 | def _format_x(self, x): 13 | return " ".join(f"{k}:{v}" for k, v in x.items()) 14 | 15 | 16 | class VW2RiverClassifier(VW2RiverBase, base.Classifier): 17 | def learn_one(self, x, y): 18 | # Convert {False, True} to {-1, 1} 19 | y = int(y) 20 | y_vw = 2 * y - 1 21 | 22 | ex = self._format_x(x) 23 | ex = f"{y_vw} | {ex}" 24 | self.vw.learn(ex) 25 | 26 | def predict_proba_one(self, x): 27 | ex = "| " + self._format_x(x) 28 | y_pred = self.vw.predict(ex) 29 | return {True: y_pred, False: 1.0 - y_pred} 30 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | import platform 2 | 3 | import numpy 4 | import setuptools 5 | from Cython.Build import cythonize 6 | from setuptools.command.build_ext import build_ext 7 | from setuptools.errors import CCompilerError 8 | from setuptools_rust import Binding, RustExtension 9 | 10 | ext_modules = cythonize( 11 | module_list=[ 12 | setuptools.Extension( 13 | "*", 14 | sources=["river/**/*.pyx"], 15 | include_dirs=[numpy.get_include()], 16 | libraries=[] if platform.system() == "Windows" else ["m"], 17 | define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")], 18 | ) 19 | ], 20 | compiler_directives={ 21 | "binding": True, 22 | "embedsignature": True, 23 | }, 24 | ) 25 | 26 | rust_extensions = [RustExtension("river.stats._rust_stats", binding=Binding.PyO3)] 27 | 28 | 29 | class BuildFailed(Exception): 30 | pass 31 | 32 | 33 | class ExtBuilder(build_ext): 34 | def run(self): 35 | try: 36 | build_ext.run(self) 37 | except (FileNotFoundError): 38 | raise BuildFailed("File not found. Could not compile C extension.") 39 | 40 | def build_extension(self, ext): 41 | try: 42 | build_ext.build_extension(self, ext) 43 | except (CCompilerError, ValueError): 44 | raise BuildFailed("Could not compile C extension.") 45 | 46 | 47 | def build(setup_kwargs): 48 | """ 49 | This function is mandatory in order to build the extensions. 
50 | """ 51 | setup_kwargs.update( 52 | { 53 | "ext_modules": ext_modules, 54 | "cmdclass": {"build_ext": ExtBuilder}, 55 | "rust_extensions": rust_extensions, 56 | "zip_safe": False, 57 | "include_package_data": True, 58 | } 59 | ) 60 | -------------------------------------------------------------------------------- /docs/.pages: -------------------------------------------------------------------------------- 1 | nav: 2 | - introduction 3 | - recipes 4 | - api 5 | - examples 6 | - faq 7 | - releases 8 | - benchmarks 9 | - license 10 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | riverml.xyz 2 | -------------------------------------------------------------------------------- /docs/benchmarks/.pages: -------------------------------------------------------------------------------- 1 | title: Benchmarks 2 | -------------------------------------------------------------------------------- /docs/css/version-select.css: -------------------------------------------------------------------------------- 1 | @media only screen and (max-width:76.1875em) { 2 | #version-selector { 3 | padding: .6rem .8rem; 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /docs/examples/.pages: -------------------------------------------------------------------------------- 1 | title: Examples 🌶️ 2 | -------------------------------------------------------------------------------- /docs/examples/matrix-factorization-for-recommender-systems/.pages: -------------------------------------------------------------------------------- 1 | title: Matrix factorization for recommender systems 2 | -------------------------------------------------------------------------------- /docs/examples/matrix-factorization-for-recommender-systems/part-3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Part 3" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "To do." 
15 | ] 16 | } 17 | ], 18 | "metadata": { 19 | "kernelspec": { 20 | "display_name": "Python 3", 21 | "language": "python", 22 | "name": "python3" 23 | }, 24 | "language_info": { 25 | "codemirror_mode": { 26 | "name": "ipython", 27 | "version": 3 28 | }, 29 | "file_extension": ".py", 30 | "mimetype": "text/x-python", 31 | "name": "python", 32 | "nbconvert_exporter": "python", 33 | "pygments_lexer": "ipython3", 34 | "version": "3.11.0" 35 | } 36 | }, 37 | "nbformat": 4, 38 | "nbformat_minor": 4 39 | } 40 | -------------------------------------------------------------------------------- /docs/faq/.pages: -------------------------------------------------------------------------------- 1 | title: FAQ 2 | -------------------------------------------------------------------------------- /docs/img/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/img/favicon.ico -------------------------------------------------------------------------------- /docs/img/favicon_old.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/img/favicon_old.ico -------------------------------------------------------------------------------- /docs/img/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/img/icon.png -------------------------------------------------------------------------------- /docs/img/illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/img/illustration.png -------------------------------------------------------------------------------- /docs/img/illustration_old.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/img/illustration_old.png -------------------------------------------------------------------------------- /docs/img/online_active_learning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/img/online_active_learning.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | template: home.html 3 | title: river 4 | --- 5 | -------------------------------------------------------------------------------- /docs/introduction/.pages: -------------------------------------------------------------------------------- 1 | title: Introduction 🍼 2 | nav: 3 | - installation.md 4 | - basic-concepts.md 5 | - getting-started 6 | - why-use-river.md 7 | - next-steps.md 8 | - related-projects.md 9 | -------------------------------------------------------------------------------- /docs/introduction/getting-started/concept-drift-detection_files/concept-drift-detection_1_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/introduction/getting-started/concept-drift-detection_files/concept-drift-detection_1_0.png -------------------------------------------------------------------------------- /docs/introduction/getting-started/concept-drift-detection_files/concept-drift-detection_3_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/introduction/getting-started/concept-drift-detection_files/concept-drift-detection_3_1.png -------------------------------------------------------------------------------- /docs/introduction/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | River is meant to work with Python 3.10 and above. Installation can be done via `pip`: 4 | 5 | ```sh 6 | pip install river 7 | ``` 8 | 9 | You can install the latest development version from GitHub, like so: 10 | 11 | ```sh 12 | pip install git+https://github.com/online-ml/river --upgrade 13 | pip install git+ssh://git@github.com/online-ml/river.git --upgrade # using SSH 14 | ``` 15 | 16 | This method requires having Cython and Rust installed on your machine. 17 | 18 | Feel welcome to [open an issue on GitHub](https://github.com/online-ml/river/issues/new) if you are having any trouble. 19 | -------------------------------------------------------------------------------- /docs/introduction/next-steps.md: -------------------------------------------------------------------------------- 1 | # Next steps 2 | 3 | The [Recipes 🍱](/latest/recipes/reading-data) section is made up of small tutorials. Each one explains how to perform mundane tasks, such as measuring the performance of a model, selecting hyperparameters, etc. 4 | 5 | The [Examples 🌶️](/latest/examples/batch-to-online) section contains more involved notebooks with fewer explanations. Each notebook addresses a particular machine learning problem. 6 | 7 | The [API 📚](/latest/api) section references all the modules, classes, and functions in River. It is automatically generated from the codebase's Python docstrings. 8 | 9 | Feel welcome to [open a discussion](https://github.com/online-ml/river/discussions) if you have a question. Before that, you can check out the [FAQ 🙋](/latest/faq), which has answers to recurring questions. 10 | 11 | The released versions are listed in the [Releases 🏗](/latest/releases) section. Changes that will be part of the next release are listed in the unreleased section of the documentation's development version, which you may find [here](https://riverml.xyz/latest/releases/unreleased/). 12 | 13 | We recommend checking out [Awesome Online Machine Learning](https://github.com/online-ml/awesome-online-machine-learning) if you want to go deeper. There you will find online machine learning related content: research papers, alternative and complementary software, blog posts, etc. 14 | -------------------------------------------------------------------------------- /docs/introduction/related-projects.md: -------------------------------------------------------------------------------- 1 | # Related projects 2 | 3 | Here is a list of projects which are more or less coupled with River: 4 | 5 | - [deep-river](https://github.com/online-ml/deep-river) interfaces PyTorch models with River. 6 | - [light-river](https://github.com/online-ml/light-river) implements fast algorithms in Rust.
7 | - [river-extra](https://github.com/online-ml/river-extra) regroups experimental features which have yet to prove themselves to make it into the main River repository. Between us we call this "the arena". 8 | - [Beaver](https://github.com/online-ml/beaver) is an MLOps tool for covering the whole lifecycle of online machine learning models. 9 | -------------------------------------------------------------------------------- /docs/introduction/why-use-river.md: -------------------------------------------------------------------------------- 1 | # Why use River? 2 | 3 | ## Processing one sample at a time 4 | 5 | All the tools in the library can be updated with a single observation at a time. They can therefore be used to process streaming data. Depending on your use case, this might be more convenient than using a batch model. 6 | 7 | ## Adapting to drift 8 | 9 | In the streaming setting, data can evolve. Adaptive methods are specifically designed to be robust against concept drift in dynamic environments. Many of River's models can cope with concept drift. 10 | 11 | ## General purpose 12 | 13 | River supports different machine learning tasks, including regression, classification, and unsupervised learning. It can also be used for ad hoc tasks, such as computing online metrics, as well as concept drift detection. 14 | 15 | ## User experience 16 | 17 | River is not the only library allowing you to do online machine learning. But it might just be the simplest one to use in the Python ecosystem. River plays nicely with Python dictionaries, therefore making it easy to use in the context of web applications where JSON payloads are aplenty. 18 | -------------------------------------------------------------------------------- /docs/javascripts/config.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.typesetPromise() 16 | }) 17 | -------------------------------------------------------------------------------- /docs/javascripts/tablesort.js: -------------------------------------------------------------------------------- 1 | document$.subscribe(function() { 2 | var tables = document.querySelectorAll("article table:not([class])") 3 | tables.forEach(function(table) { 4 | new Tablesort(table) 5 | }) 6 | }) 7 | -------------------------------------------------------------------------------- /docs/license/.pages: -------------------------------------------------------------------------------- 1 | title: License 📝 2 | 3 | -------------------------------------------------------------------------------- /docs/license/license.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | River is free and open-source software licensed under the [3-clause BSD license](https://github.com/online-ml/river/blob/main/LICENSE). 
-------------------------------------------------------------------------------- /docs/overrides/partials/footer.html: -------------------------------------------------------------------------------- 1 | {% import "partials/language.html" as lang with context %} 2 | 3 | 4 | 31 | -------------------------------------------------------------------------------- /docs/overrides/partials/integrations/analytics.html: -------------------------------------------------------------------------------- 1 | 6 | -------------------------------------------------------------------------------- /docs/recipes/.pages: -------------------------------------------------------------------------------- 1 | title: Recipes 🌮 2 | nav: 3 | - reading-data.md 4 | - model-evaluation.md 5 | - pipelines.md 6 | - feature-extraction.md 7 | - hyperparameter-tuning.md 8 | - mini-batching.md 9 | - on-hoeffding-trees.md 10 | - active-learning.md 11 | - bandits-101.md 12 | - cloning-and-mutating.md 13 | - rolling-computations.md 14 | -------------------------------------------------------------------------------- /docs/recipes/feature-extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature extraction\n", 8 | "\n", 9 | "To do." 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.11.0" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 4 34 | } 35 | -------------------------------------------------------------------------------- /docs/recipes/hyperparameter-tuning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hyperparameter tuning\n", 8 | "\n", 9 | "To do." 10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.11.0" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 4 34 | } 35 | -------------------------------------------------------------------------------- /docs/recipes/model-evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model evaluation\n", 8 | "\n", 9 | "To do." 
10 | ] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 3", 16 | "language": "python", 17 | "name": "python3" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 3 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython3", 29 | "version": "3.11.0" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 4 34 | } 35 | -------------------------------------------------------------------------------- /docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_21_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_21_0.png -------------------------------------------------------------------------------- /docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_23_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_23_0.png -------------------------------------------------------------------------------- /docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_25_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_25_0.png -------------------------------------------------------------------------------- /docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_27_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_27_0.png -------------------------------------------------------------------------------- /docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_29_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/docs/recipes/on-hoeffding-trees_files/on-hoeffding-trees_29_0.png -------------------------------------------------------------------------------- /docs/releases/.pages: -------------------------------------------------------------------------------- 1 | title: Releases 2 | sort_type: natural 3 | order: desc 4 | -------------------------------------------------------------------------------- /docs/releases/0.0.2.md: -------------------------------------------------------------------------------- 1 | # 0.0.2 - 2019-02-13 2 | 3 | - [PyPI](https://pypi.org/project/river/0.0.2/) 4 | - [GitHub](https://github.com/online-ml/river/releases/tag/0.0.2) 5 | 6 | ## compat 7 | 8 | - Added `sklearn` wrappers. 9 | 10 | ## ensemble 11 | 12 | - Added `ensemble.HedgeClassifier`. 13 | 14 | ## feature_selection 15 | 16 | - Added `feature_selection.RandomDiscarder`. 17 | 18 | ## feature_extraction 19 | 20 | - Added `feature_extraction.TargetEncoder`. 21 | 22 | ## impute 23 | 24 | - Added `impute.NumericImputer`. 25 | 26 | ## optim 27 | 28 | - Added `optim.AbsoluteLoss`. 29 | - Added `optim.HingeLoss`. 30 | - Added `optim.EpsilonInsensitiveHingeLoss`. 
31 | 32 | ## stats 33 | 34 | - Added `stats.NUnique`. 35 | - Added `stats.Min`. 36 | - Added `stats.Max`. 37 | - Added `stats.PeakToPeak`. 38 | - Added `stats.Kurtosis`. 39 | - Added `stats.Skew`. 40 | - Added `stats.Sum`. 41 | - Added `stats.EWMean`. 42 | - Made sure the running statistics produce the same results as `pandas.DataFrame.rolling` method. 43 | -------------------------------------------------------------------------------- /docs/releases/0.10.1.md: -------------------------------------------------------------------------------- 1 | # 0.10.1 - 2022-02-05 2 | 3 | ## evaluate 4 | 5 | `evaluate.progressive_val_score` can now handle models which use `**kwargs` in their `learn_one` and `predict_one` methods. For instance, this is useful for `reco.Ranker` models which require passing a user and an item. 6 | -------------------------------------------------------------------------------- /docs/releases/0.11.1.md: -------------------------------------------------------------------------------- 1 | # 0.11.1 - 2022-06-06 2 | 3 | A small release to introduce benchmarks. 4 | 5 | ## anomaly 6 | 7 | - Fixed a bug where anomaly filters were never updated. 8 | -------------------------------------------------------------------------------- /docs/releases/0.12.1.md: -------------------------------------------------------------------------------- 1 | # 0.12.1 - 2022-09-02 2 | 3 | ## base 4 | 5 | - Fix the way the `clone` method handles positional arguments. 6 | -------------------------------------------------------------------------------- /docs/releases/0.13.0.md: -------------------------------------------------------------------------------- 1 | # 0.13.0 - 2022-09-15 2 | 3 | ## compose 4 | 5 | - `compose.TransformerUnion` parts can now be accessed by index as well as by name. 6 | 7 | ## stats 8 | 9 | - Added the `LossyCount` for tracking frequent itemsets. This implementation also supports a forgetting factor to reduce the influence of old elements. 10 | - The following statistics are now implemented in Rust: 11 | - `Quantile` 12 | - `EWMean` 13 | - `EWVar` 14 | - `IQR` 15 | - `Kurtosis` 16 | - `PeaktoPeak` 17 | - `Skew` 18 | - `RollingQuantile` 19 | - `RollingIQR` 20 | 21 | ## stream 22 | 23 | - Implemented `stream.TwitchChatStream`. 24 | -------------------------------------------------------------------------------- /docs/releases/0.14.0.md: -------------------------------------------------------------------------------- 1 | # 0.14.0 - 2022-10-26 2 | 3 | - Introducing the `bandit` module for running multi-armed bandits 4 | - Introducing the `sketch` module with summarization tools and data sketches working in a streaming fashion! 5 | 6 | ## bandit 7 | 8 | - Added `bandit.EpsilonGreedy`. 9 | - Added `bandit.UCB`. 10 | - Added `bandit.ThomsonSampling`. 11 | - Added a `bandit.base` module. 12 | - Added `bandit.envs.CandyCaneContest`, which implements the Gym interface. 13 | - Added `bandit.envs.KArmedTestbed`, which implements the Gym interface. 14 | - Added `bandit.evaluate` for basic benchmarking of bandit policies on a Gym environment. 15 | 16 | ## drift 17 | 18 | - Exposed more parameters in ADWIN: `clock`, `max_buckets`, `min_window_length`, and `grace_period`. 19 | 20 | ## model_selection 21 | 22 | - Added `model_selection.BanditRegressor`, which is a generic model selection method that works with any bandit policy. 23 | - Removed `model_selection.EpsilonGreedyRegressor` due to the addition of `model_selection.BanditRegressor`. 
24 | - Removed `model_selection.UCBRegressor` due to the addition of `model_selection.BanditRegressor`. 25 | 26 | ## proba 27 | 28 | - Added `proba.Beta`. 29 | - Added a `sample` method to each distribution. 30 | - Added a `mode` property to each distribution. 31 | - Replaced the `pmf` and `pdf` methods with a `__call__` method. 32 | 33 | ## sketch 34 | 35 | - Moved `misc.Histogram` to `sketch.Histogram`. 36 | - Moved `stats.LossyCount` to `sketch.HeavyHitters` and updated its API to better match `collections.Counter`. 37 | - Added missing return `self` in `HeavyHitters`. 38 | - Added the Count-Min Sketch (`sketch.Counter`) algorithm for approximate element counting. 39 | - Added an implementation of Bloom filter (`sketch.Set`) to provide approximate set-like operations. 40 | -------------------------------------------------------------------------------- /docs/releases/0.16.0.md: -------------------------------------------------------------------------------- 1 | # 0.16.0 - 2023-05-08 2 | 3 | Added wheels for Python 3.11. 4 | 5 | ## feature_extraction 6 | 7 | - `feature_extraction.Agg` and `feature_extraction.TargetAgg` can now be passed an optional `t` in their `learn_one` method, which allows them to work with `utils.TimeRolling`. 8 | 9 | ## metrics 10 | 11 | - Added `metrics.MAPE`. 12 | - Added `metrics.RollingROCAUC`. 13 | 14 | ## preprocessing 15 | 16 | - Added `preprocessing.GaussianRandomProjector`. 17 | - Added `preprocessing.SparseRandomProjector`. 18 | 19 | ## stats 20 | 21 | - Fixed a randomness issue with the first few outputs of `stats.Quantile`. 22 | -------------------------------------------------------------------------------- /docs/releases/0.17.0.md: -------------------------------------------------------------------------------- 1 | # 0.17.0 - 2023-05-27 2 | 3 | ## bandit 4 | 5 | - Bandit policies now return a single arm when the `pull` method is called, instead of yielding one or more arms at a time. This is simpler to understand. We will move back to multi-armed pulls in the future. 6 | - Added `bandit.Exp3`. 7 | - `bandit.UCB` and `bandit.Exp3` have an extra `reward_scaler` parameter, which can be any object that inherits from `compose.TargetTransformRegressor`. This allows scaling rewards before updating arms. 8 | 9 | ## compose 10 | 11 | - `compose.TransformerProduct` now correctly returns a `compose.TransformerUnion` when a transformer is added to it. 12 | - Fixed `compose.TransformerProduct`'s `transform_many` behavior. 13 | - `compose.TransformerUnion` and `compose.TransformerProduct` will now clone the provided estimators, so that shallow copies aren't shared in different places. 14 | 15 | ## model_selection 16 | 17 | - Added `model_selection.BanditClassifier`, which is the classification equivalent of `model_selection.BanditRegressor`. Both are methods to perform online model selection via a bandit policy. 18 | 19 | ## multioutput 20 | 21 | - `metrics.multioutput.MacroAverage` and `metrics.multioutput.MicroAverage` now loop over the keys of `y_true` instead of `y_pred`. This ensures a `KeyError` is correctly raised if `y_pred` is missing an output that is present in `y_true`. 22 | 23 | ## preprocessing 24 | 25 | - Added `preprocessing.TargetMinMaxScaler`, which operates the same as `preprocessing.TargetStandardScaler`, but instead uses min-max scaling.
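For illustration, here is a minimal sketch of how `preprocessing.TargetMinMaxScaler` can wrap a regressor, mirroring the way `preprocessing.TargetStandardScaler` is typically used; the dataset, pipeline, and learning rate are arbitrary example choices.

```python
from river import datasets, evaluate, linear_model, metrics, preprocessing

# Min-max scale the target on the fly, learn on the scaled target,
# and map the predictions back to the original range.
model = (
    preprocessing.StandardScaler()
    | preprocessing.TargetMinMaxScaler(
        regressor=linear_model.LinearRegression(intercept_lr=0.15)
    )
)

print(evaluate.progressive_val_score(datasets.TrumpApproval(), model, metrics.MAE()))
```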
26 | -------------------------------------------------------------------------------- /docs/releases/0.20.1.md: -------------------------------------------------------------------------------- 1 | # 0.20.1 - 2023-11-09 2 | 3 | Dummy release to make wheels available. No actual difference with v0.20.0. 4 | -------------------------------------------------------------------------------- /docs/releases/0.21.0.md: -------------------------------------------------------------------------------- 1 | # 0.21.0 - 2023-12-04 2 | 3 | - The `learn_one` and `learn_many` methods of each estimator no longer return anything. This is to emphasize that the estimators are stateful. 4 | - The `update` and `revert` methods of classes that have them also cease to return anything. 5 | - `sample_weight` has been renamed to `w`. 6 | 7 | ## covariance 8 | 9 | - Fixed an issue where `update_many` would reset `covariance.EmpiricalCovariance` each time it was called. 10 | -------------------------------------------------------------------------------- /docs/releases/0.21.1.md: -------------------------------------------------------------------------------- 1 | # 0.21.1 - 2024-03-28 2 | 3 | This release should fix some of the installation issues when building the River wheel from scratch. 4 | 5 | ## anomaly 6 | 7 | - Added `PredictiveAnomalyDetection`, a semi-supervised technique that employs a predictive model for anomaly detection. 8 | 9 | ## drift 10 | 11 | - Added the `FHDDM` drift detector. 12 | - Added an `iter_polars` function to iterate over the rows of a polars DataFrame. 13 | 14 | ## neighbors 15 | 16 | - Simplified `neighbors.SWINN` to avoid recursion limit and pickling issues. 17 | -------------------------------------------------------------------------------- /docs/releases/0.21.2.md: -------------------------------------------------------------------------------- 1 | # 0.21.2 - 2024-07-08 2 | 3 | This release makes Polars an optional dependency instead of a required one. 4 | 5 | ## cluster 6 | 7 | - Added `ODAC` (Online Divisive-Agglomerative Clustering) for clustering time series. 8 | 9 | ## forest 10 | 11 | - Fixed an error in `forest.ARFClassifier` and `forest.ARFRegressor` where the algorithms would crash when the number of features available for learning fell below the value of the `max_features` parameter (#1560). 12 | -------------------------------------------------------------------------------- /docs/releases/0.22.0.md: -------------------------------------------------------------------------------- 1 | # 0.22.0 - 2024-11-23 2 | 3 | - Dropped support for Python 3.9 and added support for Python 3.13. 4 | - The methods `learn_one`, `learn_many`, `update`, `revert`, and `append` now return `None`. 5 | - The units used in River have been corrected to be based on powers of 2 (KiB, MiB). This only changes the display; the behaviour is unchanged. 6 | 7 | ## cluster 8 | 9 | - Updated the description of `cluster.ODAC`. 10 | - Changed `draw` in `cluster.ODAC` to draw the hierarchical cluster's structure as a Graphviz graph. 11 | - Added `render_ascii` in `cluster.ODAC` to render the hierarchical cluster's structure in text format. 12 | - Made `cluster.ODAC` work with `stats.Var` when a cluster has only one time series. 13 | 14 | ## drift 15 | 16 | - Made `drift.ADWIN` comply with the reference MOA implementation. 17 | 18 | ## feature extraction 19 | 20 | - The mini-batch methods for `feature_extraction.TFIDF` now systematically raise an exception, as they are not implemented; see the sketch below.
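To make the change above concrete, here is a minimal sketch. It assumes pandas is installed; the exact exception type is not specified in the note, so it is caught generically.

```python
import pandas as pd
from river import feature_extraction

tfidf = feature_extraction.TFIDF()

# The single-instance API is unchanged: learn_one takes a document (a string).
tfidf.learn_one("the cat sat on the mat")

# The mini-batch methods, on the other hand, are not implemented and now raise.
docs = pd.Series(["the cat sat on the mat", "the dog barked"])
try:
    tfidf.transform_many(docs)
except Exception as err:  # the exact exception type is not specified in the note
    print(type(err).__name__)
```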
21 | 22 | ## stats 23 | 24 | - Removed the unexported class `stats.CentralMoments`. 25 | 26 | ## tree 27 | 28 | - Instead of letting trees grow indefinitely, setting the `max_depth` parameter to `None` now stops trees from growing once they reach the system recursion limit. 29 | - Added `tree.LASTClassifier` (Local Adaptive Streaming Tree Classifier). 30 | 31 | ## stream 32 | 33 | - `stream.iter_arff` now supports blank values (treated as missing values). 34 | -------------------------------------------------------------------------------- /docs/releases/0.3.0.md: -------------------------------------------------------------------------------- 1 | # 0.3.0 - 2019-06-23 2 | 3 | - [PyPI](https://pypi.org/project/river/0.3.0/) 4 | - [GitHub](https://github.com/online-ml/river/releases/tag/0.3.0) 5 | 6 | ## datasets 7 | 8 | - Added `datasets.load_chick_weights`. 9 | 10 | ## decomposition 11 | 12 | - Added `decomposition.LDA`. 13 | 14 | ## ensemble 15 | 16 | - Added `ensemble.HedgeRegressor`. 17 | - Added `ensemble.StackingBinaryClassifier`. 18 | 19 | ## metrics 20 | 21 | - Added `metrics.FBeta`. 22 | - Added `metrics.MacroFBeta`. 23 | - Added `metrics.MicroFBeta`. 24 | - Added `metrics.MultiFBeta`. 25 | - Added `metrics.RollingFBeta`. 26 | - Added `metrics.RollingMacroFBeta`. 27 | - Added `metrics.RollingMicroFBeta`. 28 | - Added `metrics.RollingMultiFBeta`. 29 | - Added `metrics.Jaccard`. 30 | - Added `metrics.RollingConfusionMatrix`. 31 | - Added `metrics.RegressionMultiOutput`. 32 | - Added `metrics.MCC`. 33 | - Added `metrics.RollingMCC`. 34 | - Added `metrics.ROCAUC`. 35 | - Renamed `metrics.F1Score` to `metrics.F1`. 36 | 37 | ## multioutput 38 | 39 | - Added `multioutput.ClassifierChain`. 40 | - Added `multioutput.RegressorChain`. 41 | 42 | ## optim 43 | 44 | - Added `optim.QuantileLoss`. 45 | - Added `optim.MiniBatcher`. 46 | 47 | ## preprocessing 48 | 49 | - Added `preprocessing.Normalizer`. 50 | 51 | ## proba 52 | 53 | - Added `proba.Multinomial`. 54 | -------------------------------------------------------------------------------- /docs/releases/0.4.3.md: -------------------------------------------------------------------------------- 1 | # 0.4.3 - 2019-10-27 2 | 3 | - [PyPI](https://pypi.org/project/river/0.4.3/) 4 | - [GitHub](https://github.com/online-ml/river/releases/tag/0.4.3) 5 | 6 | ## base 7 | 8 | - Models that inherit from `base.Wrapper` (e.g. `tree.RandomForestClassifier`) can now be pickled. 9 | 10 | ## datasets 11 | 12 | - Added `datasets.fetch_credit_card`. 13 | 14 | ## utils 15 | 16 | - Added the `utils.math` sub-module. 17 | 18 | ## tree 19 | 20 | - Fixed the `debug_one` method of `tree.DecisionTreeClassifier`. 21 | -------------------------------------------------------------------------------- /docs/releases/0.4.4.md: -------------------------------------------------------------------------------- 1 | # 0.4.4 - 2019-11-11 2 | 3 | - [PyPI](https://pypi.org/project/river/0.4.4/) 4 | - [GitHub](https://github.com/online-ml/river/releases/tag/0.4.4) 5 | 6 | This release was mainly made to provide access to wheels for Windows and macOS. 7 | 8 | ## ensemble 9 | 10 | - Added `ensemble.AdaBoostClassifier`. 11 | 12 | ## linear_model 13 | 14 | - Added a `clip_gradient` parameter to `linear_model.LinearRegression` and `linear_model.LogisticRegression`. Gradient clipping was already implemented, but the maximum absolute value can now be set by the user.
15 | - The `intercept_lr` parameter of `linear_model.LinearRegression` and `linear_model.LogisticRegression` can now be passed an instance of `optim.schedulers.Scheduler` as well as a `float`. 16 | 17 | ## metrics 18 | 19 | - Fixed `metrics.SMAPE`: the implementation was missing a multiplication by 2. 20 | 21 | ## optim 22 | 23 | - Added `optim.schedulers.Optimal`, which produces results that are identical to `sklearn.linear_model.SGDRegressor` and `sklearn.linear_model.SGDClassifier` when setting their `learning_rate` parameter to `'optimal'`. 24 | 25 | ## time_series 26 | 27 | - Added `time_series.SNARIMAX`, a generic model which encompasses well-known time series models such as ARIMA and NARX. 28 | -------------------------------------------------------------------------------- /docs/releases/0.5.1.md: -------------------------------------------------------------------------------- 1 | # 0.5.1 - 2020-03-29 2 | 3 | - [PyPI](https://pypi.org/project/river/0.5.1/) 4 | - [GitHub](https://github.com/online-ml/river/releases/tag/0.5.1) 5 | 6 | ## compose 7 | 8 | - `compose.Pipeline` and `compose.TransformerUnion` now accept variadic arguments as input instead of a list. This doesn't change anything when using the shorthand operators `|` and `+`. 9 | 10 | ## model_selection 11 | 12 | - Removed `model_selection.successive_halving`. 13 | - Added `model_selection.SuccessiveHalvingRegressor` and `model_selection.SuccessiveHalvingClassifier`. 14 | 15 | ## stream 16 | 17 | - Added a `copy` parameter to `stream.simulate_qa` in order to handle unwanted feature modifications. 18 | 19 | ## tree 20 | 21 | - Added a `curtail_under` parameter to `tree.DecisionTreeClassifier`. 22 | - The speed and accuracy of both `tree.DecisionTreeClassifier` and `tree.RandomForestClassifier` have been slightly improved for numerical attributes. 23 | - The aesthetics of the `tree.DecisionTreeClassifier.draw` method have been improved. 24 | -------------------------------------------------------------------------------- /docs/releases/0.6.1.md: -------------------------------------------------------------------------------- 1 | # 0.6.1 - 2020-06-10 2 | 3 | ## compose 4 | 5 | - Fixed a bug that occurred when part of a `compose.Transformer` was a `compose.Pipeline` and wasn't properly handled. 6 | -------------------------------------------------------------------------------- /docs/releases/0.7.0.md: -------------------------------------------------------------------------------- 1 | # 0.7.0 - 2021-04-16 2 | 3 | Alas, no release notes for this one. 4 | -------------------------------------------------------------------------------- /docs/releases/0.7.1.md: -------------------------------------------------------------------------------- 1 | # 0.7.1 - 2021-06-13 2 | 3 | Fixed an issue where scikit-learn was imported in `sam_knn.py` but wasn't specified as a dependency. 4 | 5 | ## expert 6 | 7 | - Each expert model will now raise a `NotEnoughModels` exception if only a single model is passed. 8 | 9 | ## stream 10 | 11 | - Added a `drop_nones` parameter to `stream.iter_csv`. 12 | -------------------------------------------------------------------------------- /docs/releases/0.8.0.md: -------------------------------------------------------------------------------- 1 | # 0.8.0 - 2021-08-31 2 | 3 | ## base 4 | 5 | - The `predict_many` and `predict_proba_many` methods have been removed from `base.Classifier`. They're now part of `base.MiniBatchClassifier`. 6 | 7 | ## ensemble 8 | 9 | - Implemented `ensemble.VotingClassifier`.
10 | - Implemented `ensemble.SRPRegressor`. 11 | 12 | ## meta 13 | 14 | - Renamed `meta.TransformedTargetRegressor` to `meta.TargetTransformRegressor`. 15 | - Added `meta.TargetStandardScaler`. 16 | 17 | ## preprocessing 18 | 19 | - Added a `with_std` parameter to `StandardScaler`. 20 | 21 | ## rules 22 | 23 | - Added `rules.AMRules` 24 | 25 | ## stats 26 | 27 | - Make `stats.RollingQuantile` match the default behavior of Numpy's `quantile` function. 28 | 29 | ## tree 30 | 31 | - Unified base class structure applied to all tree models. 32 | - Bug fixes. 33 | - Added `tree.SGTClassifier` and `tree.SGTRegressor`. 34 | -------------------------------------------------------------------------------- /docs/releases/unreleased.md: -------------------------------------------------------------------------------- 1 | # Unreleased 2 | 3 | ## base 4 | 5 | - The `tags` and `more_tags` properties of `base.Estimator` are now both a set of strings. 6 | - The `base` module is now fully type-annotated. Some type hints have changed, but this does not impact the behaviour of the code. For instance, the regression target is now indicated as a float instead of a Number. 7 | - `base.Ensemble`, `base.Wrapper`, and `base.WrapperEnsemble` became generic with regard to the type they encapsulate. 8 | -------------------------------------------------------------------------------- /river/__init__.py: -------------------------------------------------------------------------------- 1 | """River is a library for incremental learning. Incremental learning is a machine learning regime 2 | where the observations are made available one by one. It is also known as online learning, 3 | iterative learning, or sequential learning. This is in contrast to batch learning where all the 4 | data is processed at once. Incremental learning is desirable when the data is too big to fit in 5 | memory, or simply when it isn't available all at once. river's API is heavily inspired from that of 6 | scikit-learn, enough so that users who are familiar with scikit-learn should feel right at home. 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | from .__version__ import __version__ # noqa: F401 12 | -------------------------------------------------------------------------------- /river/__version__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | __version__ = "0.22.0" 4 | -------------------------------------------------------------------------------- /river/active/__init__.py: -------------------------------------------------------------------------------- 1 | """Online active learning.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import base 6 | from .entropy import EntropySampler 7 | 8 | __all__ = ["base", "EntropySampler"] 9 | -------------------------------------------------------------------------------- /river/anomaly/__init__.py: -------------------------------------------------------------------------------- 1 | """Anomaly detection. 2 | 3 | Estimators in the `anomaly` module have a bespoke API. Each anomaly detector has a `score_one` 4 | method instead of a `predict_one` method. This method returns an anomaly score. Normal observations 5 | should have a low score, whereas anomalous observations should have a high score. The range of the 6 | scores is relative to each estimator. 7 | 8 | Anomaly detectors are usually unsupervised, in that they analyze the distribution of the features 9 | they are shown. 
But River also has a notion of supervised anomaly detectors. These analyze the 10 | distribution of a target variable, and optionally include the distribution of the features as well. They are useful for detecting labelling anomalies, which can be detrimental if they learned by a 11 | model. 12 | 13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | from . import base 18 | from .filter import QuantileFilter, ThresholdFilter 19 | from .gaussian import GaussianScorer 20 | from .hst import HalfSpaceTrees 21 | from .lof import LocalOutlierFactor 22 | from .pad import PredictiveAnomalyDetection 23 | from .sad import StandardAbsoluteDeviation 24 | from .svm import OneClassSVM 25 | 26 | __all__ = [ 27 | "base", 28 | "AnomalyDetector", 29 | "GaussianScorer", 30 | "HalfSpaceTrees", 31 | "OneClassSVM", 32 | "QuantileFilter", 33 | "StandardAbsoluteDeviation", 34 | "ThresholdFilter", 35 | "LocalOutlierFactor", 36 | "PredictiveAnomalyDetection", 37 | ] 38 | -------------------------------------------------------------------------------- /river/anomaly/test_hst.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def test_missing_features(): 5 | """Checks that HalfSpaceTrees works even if a feature is missing. 6 | 7 | >>> import random 8 | >>> from river import anomaly 9 | >>> from river import compose 10 | >>> from river import datasets 11 | >>> from river import metrics 12 | >>> from river import preprocessing 13 | 14 | >>> model = compose.Pipeline( 15 | ... preprocessing.MinMaxScaler(), 16 | ... anomaly.HalfSpaceTrees(seed=42) 17 | ... ) 18 | 19 | >>> auc = metrics.ROCAUC() 20 | 21 | >>> features = list(next(iter(datasets.CreditCard()))[0].keys()) 22 | >>> random.seed(42) 23 | 24 | >>> for x, y in datasets.CreditCard().take(8000): 25 | ... del x[random.choice(features)] 26 | ... score = model.score_one(x) 27 | ... model.learn_one(x, y) 28 | ... 
auc.update(y, score) 29 | 30 | >>> auc 31 | ROCAUC: 88.68% 32 | 33 | """ 34 | -------------------------------------------------------------------------------- /river/anomaly/test_svm.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | import pytest 6 | from sklearn import linear_model as sklm 7 | 8 | from river import anomaly, datasets, optim 9 | 10 | tests = { 11 | "Vanilla": ( 12 | {"optimizer": optim.SGD(1e-2), "nu": 0.5}, 13 | {"learning_rate": "constant", "eta0": 1e-2, "nu": 0.5}, 14 | ), 15 | "No intercept": ( 16 | {"optimizer": optim.SGD(1e-2), "nu": 0.5, "intercept_lr": 0.0}, 17 | {"learning_rate": "constant", "eta0": 1e-2, "nu": 0.5, "fit_intercept": False}, 18 | ), 19 | } 20 | 21 | 22 | @pytest.mark.parametrize( 23 | "river_params, sklearn_params", 24 | tests.values(), 25 | ids=tests.keys(), 26 | ) 27 | def test_sklearn_coherence(river_params, sklearn_params): 28 | """Checks that the sklearn and river implementations produce the same results.""" 29 | 30 | rv = anomaly.OneClassSVM(**river_params) 31 | sk = sklm.SGDOneClassSVM(**sklearn_params) 32 | 33 | for x, _ in datasets.Phishing().take(100): 34 | rv.learn_one(x) 35 | sk.partial_fit([list(x.values())]) 36 | 37 | for i, w in enumerate(rv.weights.values()): 38 | assert math.isclose(w, sk.coef_[i]) 39 | -------------------------------------------------------------------------------- /river/api.py: -------------------------------------------------------------------------------- 1 | """River API module.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import ( 6 | active, 7 | anomaly, 8 | bandit, 9 | base, 10 | cluster, 11 | compat, 12 | compose, 13 | conf, 14 | covariance, 15 | datasets, 16 | drift, 17 | dummy, 18 | ensemble, 19 | evaluate, 20 | facto, 21 | feature_extraction, 22 | feature_selection, 23 | forest, 24 | imblearn, 25 | linear_model, 26 | metrics, 27 | misc, 28 | model_selection, 29 | multiclass, 30 | multioutput, 31 | naive_bayes, 32 | neighbors, 33 | neural_net, 34 | optim, 35 | preprocessing, 36 | proba, 37 | reco, 38 | rules, 39 | sketch, 40 | stats, 41 | stream, 42 | time_series, 43 | tree, 44 | utils, 45 | ) 46 | 47 | __all__ = [ 48 | "active", 49 | "anomaly", 50 | "base", 51 | "bandit", 52 | "cluster", 53 | "compat", 54 | "compose", 55 | "conf", 56 | "covariance", 57 | "datasets", 58 | "dummy", 59 | "drift", 60 | "ensemble", 61 | "evaluate", 62 | "facto", 63 | "feature_extraction", 64 | "feature_selection", 65 | "forest", 66 | "imblearn", 67 | "linear_model", 68 | "metrics", 69 | "misc", 70 | "model_selection", 71 | "multiclass", 72 | "multioutput", 73 | "naive_bayes", 74 | "neighbors", 75 | "neural_net", 76 | "optim", 77 | "preprocessing", 78 | "proba", 79 | "reco", 80 | "rules", 81 | "sketch", 82 | "stats", 83 | "stream", 84 | "time_series", 85 | "tree", 86 | "utils", 87 | ] 88 | -------------------------------------------------------------------------------- /river/bandit/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi-armed bandit (MAB) policies. 2 | 3 | The bandit policies in River have a generic API. This allows them to be used in a variety of 4 | situations. For instance, they can be used for model selection 5 | (see `model_selection.BanditRegressor`). 6 | 7 | """ 8 | 9 | from __future__ import annotations 10 | 11 | from . 
import base, datasets, envs 12 | from .bayes_ucb import BayesUCB 13 | from .epsilon_greedy import EpsilonGreedy 14 | from .evaluate import evaluate, evaluate_offline 15 | from .exp3 import Exp3 16 | from .lin_ucb import LinUCBDisjoint 17 | from .random import RandomPolicy 18 | from .thompson import ThompsonSampling 19 | from .ucb import UCB 20 | 21 | __all__ = [ 22 | "base", 23 | "datasets", 24 | "envs", 25 | "evaluate", 26 | "evaluate_offline", 27 | "BayesUCB", 28 | "EpsilonGreedy", 29 | "Exp3", 30 | "LinUCBDisjoint", 31 | "ThompsonSampling", 32 | "UCB", 33 | "RandomPolicy", 34 | ] 35 | -------------------------------------------------------------------------------- /river/bandit/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .base import BanditDataset 4 | from .news import NewsArticles 5 | 6 | __all__ = ["BanditDataset", "NewsArticles"] 7 | -------------------------------------------------------------------------------- /river/bandit/datasets/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | 5 | from river import bandit, datasets 6 | 7 | 8 | class BanditDataset(datasets.base.Dataset): 9 | """Base class for bandit datasets. 10 | 11 | Parameters 12 | ---------- 13 | n_features 14 | Number of features in the dataset. 15 | n_samples 16 | Number of samples in the dataset. 17 | n_classes 18 | Number of classes in the dataset, only applies to classification datasets. 19 | n_outputs 20 | Number of outputs the target is made of, only applies to multi-output datasets. 21 | sparse 22 | Whether the dataset is sparse or not. 23 | 24 | """ 25 | 26 | def __init__( 27 | self, 28 | n_features, 29 | n_samples=None, 30 | n_classes=None, 31 | n_outputs=None, 32 | sparse=False, 33 | ): 34 | super().__init__( 35 | task="BANDIT", 36 | n_features=n_features, 37 | n_samples=n_samples, 38 | n_classes=n_classes, 39 | n_outputs=n_outputs, 40 | sparse=sparse, 41 | ) 42 | 43 | @abc.abstractproperty 44 | def arms(self) -> list[bandit.base.ArmID]: 45 | """The list of arms that can be pulled.""" 46 | 47 | @property 48 | def _repr_content(self): 49 | return {**super()._repr_content, "Arms": f"{len(self.arms):,d}"} 50 | -------------------------------------------------------------------------------- /river/bandit/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | try: 4 | import gymnasium as gym 5 | 6 | GYM_INSTALLED = True 7 | except ImportError: 8 | GYM_INSTALLED = False 9 | 10 | if GYM_INSTALLED: 11 | from .candy_cane import CandyCaneContest 12 | from .testbed import KArmedTestbed 13 | 14 | __all__ = ["CandyCaneContest", "KArmedTestbed"] 15 | 16 | RIVER_NAMESPACE = "river_bandits" 17 | 18 | if (env_id := f"{RIVER_NAMESPACE}/CandyCaneContest-v0") not in gym.envs.registration.registry: 19 | gym.envs.registration.register( 20 | id=env_id, 21 | entry_point="river.bandit.envs:CandyCaneContest", 22 | max_episode_steps=CandyCaneContest.n_steps, 23 | ) 24 | if (env_id := f"{RIVER_NAMESPACE}/KArmedTestbed-v0") not in gym.envs.registration.registry: 25 | gym.envs.registration.register( 26 | id=env_id, 27 | entry_point="river.bandit.envs:KArmedTestbed", 28 | max_episode_steps=KArmedTestbed.n_steps, 29 | ) 30 | -------------------------------------------------------------------------------- /river/bandit/envs/testbed.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | import gymnasium as gym 6 | 7 | 8 | class KArmedTestbed(gym.Env): 9 | """k-armed testbed. 10 | 11 | This is a simple environment that can be used to test bandit algorithms. It is based on the 12 | 10 armed testbed described in the book "Reinforcement Learning: An Introduction" by Sutton and 13 | Barto. 14 | 15 | Parameters 16 | ---------- 17 | k 18 | Number of arms. 19 | 20 | """ 21 | 22 | n_steps = 1000 23 | 24 | def __init__(self, k: int = 10): 25 | super().__init__() 26 | self.k = k 27 | self.action_space = gym.spaces.Discrete(k) 28 | self.observation_space = gym.spaces.Discrete(k) 29 | self.reward_range = (-math.inf, math.inf) 30 | 31 | def reset(self, seed=None, options=None): 32 | super().reset(seed=seed) 33 | self._actual_rewards = self.np_random.normal(loc=0, scale=1, size=self.k).tolist() 34 | self._best_arm = max(enumerate(self._actual_rewards), key=lambda x: x[1])[0] 35 | observation = self._best_arm 36 | info = {} 37 | return observation, info 38 | 39 | def step(self, arm): 40 | arm_reward = self._actual_rewards[arm] 41 | reward = self.np_random.normal(loc=arm_reward, scale=1) 42 | 43 | observation = self._best_arm 44 | info = {} 45 | terminated = False 46 | truncated = False 47 | return observation, reward, terminated, truncated, info 48 | -------------------------------------------------------------------------------- /river/bandit/test_envs.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import gymnasium as gym 4 | import gymnasium.utils.env_checker 5 | import pytest 6 | 7 | from river import bandit 8 | 9 | 10 | def _iter_envs(): 11 | for env_name in gym.envs.registry: 12 | if env_name.startswith(bandit.envs.RIVER_NAMESPACE): 13 | yield gym.make(env_name) 14 | 15 | 16 | @pytest.mark.parametrize( 17 | "env", 18 | [pytest.param(env, id=env.unwrapped.__class__.__name__) for env in _iter_envs()], 19 | ) 20 | def test_gym_check_env(env): 21 | gym.utils.env_checker.check_env(env.unwrapped) 22 | -------------------------------------------------------------------------------- /river/base/clusterer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | from typing import Any 5 | 6 | from . import estimator, typing 7 | 8 | 9 | class Clusterer(estimator.Estimator): 10 | """A clustering model.""" 11 | 12 | @property 13 | def _supervised(self) -> bool: 14 | return False 15 | 16 | @abc.abstractmethod 17 | def learn_one(self, x: dict[typing.FeatureName, Any]) -> None: 18 | """Update the model with a set of features `x`. 19 | 20 | Parameters 21 | ---------- 22 | x 23 | A dictionary of features. 24 | 25 | """ 26 | 27 | @abc.abstractmethod 28 | def predict_one(self, x: dict[typing.FeatureName, Any]) -> int: 29 | """Predicts the cluster number for a set of features `x`. 30 | 31 | Parameters 32 | ---------- 33 | x 34 | A dictionary of features. 35 | 36 | Returns 37 | ------- 38 | A cluster number. 
39 | 40 | """ 41 | -------------------------------------------------------------------------------- /river/base/ensemble.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import UserList 4 | from collections.abc import Iterator 5 | from random import Random 6 | from typing import TypeVar 7 | 8 | from .estimator import Estimator 9 | from .wrapper import Wrapper 10 | 11 | T = TypeVar("T", bound=Estimator) 12 | 13 | 14 | class Ensemble(UserList[T]): 15 | """An ensemble is a model which is composed of a list of models. 16 | 17 | Parameters 18 | ---------- 19 | models 20 | 21 | """ 22 | 23 | def __init__(self, models: Iterator[T]) -> None: 24 | super().__init__(models) 25 | 26 | if len(self) < self._min_number_of_models: 27 | raise ValueError( 28 | f"At least {self._min_number_of_models} models are expected, " 29 | + f"only {len(self)} were passed" 30 | ) 31 | 32 | @property 33 | def _min_number_of_models(self) -> int: 34 | return 2 35 | 36 | @property 37 | def models(self) -> list[T]: 38 | return self.data 39 | 40 | 41 | class WrapperEnsemble(Ensemble[T], Wrapper[T]): 42 | """A wrapper ensemble is an ensemble composed of multiple copies of the same model. 43 | 44 | Parameters 45 | ---------- 46 | model 47 | The model to copy. 48 | n_models 49 | The number of copies to make. 50 | seed 51 | Random number generator seed for reproducibility. 52 | 53 | """ 54 | 55 | def __init__(self, model: T, n_models: int, seed: int | None) -> None: 56 | super().__init__(model.clone() for _ in range(n_models)) 57 | self.model = model 58 | self.n_models = n_models 59 | self.seed = seed 60 | self._rng = Random(seed) 61 | 62 | @property 63 | def _wrapped_model(self) -> T: 64 | return self.model 65 | -------------------------------------------------------------------------------- /river/base/tags.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | TEXT_INPUT = "text input" 4 | POSITIVE_INPUT = "positive input" 5 | -------------------------------------------------------------------------------- /river/base/typing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import typing 4 | 5 | FeatureName = typing.Hashable 6 | RegTarget = float 7 | ClfTarget = typing.Union[bool, str, int] # noqa: UP007 8 | Target = typing.Union[ClfTarget, RegTarget] # noqa: UP007 9 | Dataset = typing.Iterable[typing.Tuple[dict[FeatureName, typing.Any], typing.Any]] # noqa: UP006 10 | Stream = typing.Iterator[typing.Tuple[dict[FeatureName, typing.Any], typing.Any]] # noqa: UP006 11 | 12 | 13 | # These classes aim to provide the first blocks towards using protocols. 14 | # They should be modified if needed. 15 | class Learner(typing.Protocol): 16 | def learn_one(self, x: dict[FeatureName, typing.Any], y: Target) -> None: ... 17 | 18 | 19 | class Predictor(Learner, typing.Protocol): 20 | def predict_one(self, x: dict[FeatureName, typing.Any]) -> Target: ... 
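# A minimal sketch of how these protocols are meant to be used: a user-defined
# class satisfies `Predictor` structurally, i.e. without inheriting from it, as
# long as it exposes compatible `learn_one` and `predict_one` methods. The
# `_RunningMeanRegressor` below is a hypothetical example, not part of River.
class _RunningMeanRegressor:
    def __init__(self) -> None:
        self._sum = 0.0
        self._count = 0

    def learn_one(self, x: dict[FeatureName, typing.Any], y: Target) -> None:
        # Accumulate the target so that the prediction is the running mean.
        self._sum += float(y)
        self._count += 1

    def predict_one(self, x: dict[FeatureName, typing.Any]) -> Target:
        return self._sum / self._count if self._count else 0.0


_predictor: Predictor = _RunningMeanRegressor()  # accepted by a static type checker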
21 | -------------------------------------------------------------------------------- /river/base/wrapper.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Generic, TypeVar 5 | 6 | from river import base 7 | 8 | from .estimator import Estimator # Prevent a circular import of module base 9 | 10 | T = TypeVar("T", bound=Estimator) 11 | 12 | 13 | class Wrapper(ABC, Generic[T]): 14 | """A wrapper model.""" 15 | 16 | @property 17 | @abstractmethod 18 | def _wrapped_model(self) -> T: 19 | """Provides access to the wrapped model.""" 20 | 21 | @property 22 | def _labelloc(self) -> str: 23 | """Indicates location of the wrapper name when drawing pipelines.""" 24 | return "t" # for top 25 | 26 | def __str__(self) -> str: 27 | return f"{type(self).__name__}({self._wrapped_model})" 28 | 29 | def _more_tags(self) -> set[str]: 30 | return self._wrapped_model._tags 31 | 32 | @property 33 | def _supervised(self) -> bool: 34 | return self._wrapped_model._supervised 35 | 36 | @property 37 | def _multiclass(self) -> bool: 38 | return isinstance(self._wrapped_model, base.Classifier) and self._wrapped_model._multiclass 39 | -------------------------------------------------------------------------------- /river/checks/anomaly.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def check_roc_auc(anomaly_detector, dataset): 5 | """The ROC AUC should always be above 50%.""" 6 | 7 | from sklearn import metrics 8 | 9 | scores = [] 10 | labels = [] 11 | 12 | for x, y in dataset: 13 | anomaly_detector.learn_one(x) 14 | y_pred = anomaly_detector.score_one(x) 15 | 16 | scores.append(y_pred) 17 | labels.append(y) 18 | 19 | assert metrics.roc_auc_score(labels, scores) >= 0.5 20 | -------------------------------------------------------------------------------- /river/checks/clf.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import copy 4 | import math 5 | 6 | 7 | def check_predict_proba_one(classifier, dataset): 8 | """predict_proba_one should return a valid probability distribution and be pure.""" 9 | 10 | from river import utils 11 | 12 | if not hasattr(classifier, "predict_proba_one"): 13 | return 14 | 15 | for x, y in dataset: 16 | xx, yy = copy.deepcopy(x), copy.deepcopy(y) 17 | 18 | classifier.learn_one(x, y) 19 | y_pred = classifier.predict_proba_one(x) 20 | 21 | if utils.inspect.isactivelearner(classifier): 22 | y_pred, _ = y_pred 23 | 24 | # Check the probabilities are coherent 25 | assert isinstance(y_pred, dict) 26 | for proba in y_pred.values(): 27 | assert 0.0 <= proba <= 1.0 28 | assert math.isclose(sum(y_pred.values()), 1.0) 29 | 30 | # Check predict_proba_one is pure (i.e. 
x and y haven't changed) 31 | assert x == xx 32 | assert y == yy 33 | 34 | 35 | def check_predict_proba_one_binary(classifier, dataset): 36 | """predict_proba_one should return a dict with True and False keys.""" 37 | 38 | for x, y in dataset: 39 | y_pred = classifier.predict_proba_one(x) 40 | classifier.learn_one(x, y) 41 | assert set(y_pred.keys()) == {False, True} 42 | 43 | 44 | def check_multiclass_is_bool(model): 45 | assert isinstance(model._multiclass, bool) 46 | -------------------------------------------------------------------------------- /river/checks/model_selection.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import itertools 4 | 5 | 6 | def check_model_selection_order_does_not_matter(model, dataset): 7 | best_params = [] 8 | permutations = list(itertools.permutations(model.models)) 9 | datasets = itertools.tee(dataset, len(permutations)) 10 | 11 | for permutation, dataset in zip(permutations, datasets): 12 | models = [model.clone() for model in permutation] 13 | clone = model.clone(new_params={"models": models}) 14 | for x, y in dataset: 15 | clone.predict_one(x) 16 | clone.learn_one(x, y) 17 | best_params.append(clone.best_model._get_params()) 18 | 19 | # Check that the best params are always the same 20 | assert all(params == best_params[0] for params in best_params) 21 | -------------------------------------------------------------------------------- /river/checks/reco.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | 5 | 6 | def check_reco_routine(ranker): 7 | users = ["Tom", "Anna"] 8 | items = {"politics", "sports", "music", "food", "finance", "health", "camping"} 9 | 10 | def get_reward(user, item) -> bool: 11 | if user == "Tom": 12 | return item in {"music", "politics"} 13 | return item in {"politics", "sports"} 14 | 15 | for i in range(100): 16 | user = random.choice(users) 17 | item = ranker.rank(user, items)[0] 18 | 19 | clicked = get_reward(user, item) 20 | 21 | ranker.learn_one(user, item, clicked) 22 | -------------------------------------------------------------------------------- /river/checks/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import inspect 4 | import math 5 | 6 | 7 | def assert_predictions_are_close(y1, y2): 8 | if isinstance(y1, dict): 9 | for k in y1: 10 | assert_predictions_are_close(y1[k], y2[k]) 11 | elif isinstance(y1, float): 12 | assert math.isclose(y1, y2, rel_tol=1e-06) 13 | else: 14 | assert y1 == y2 15 | 16 | 17 | def seed_params(params, seed): 18 | """Looks for "seed" keys and sets the value.""" 19 | 20 | def is_class_param(param): 21 | return isinstance(param, tuple) and inspect.isclass(param[0]) and isinstance(param[1], dict) 22 | 23 | if is_class_param(params): 24 | return params[0], seed_params(params[1], seed) 25 | 26 | if not isinstance(params, dict): 27 | return params 28 | 29 | return { 30 | name: seed if name == "seed" else seed_params(param, seed) for name, param in params.items() 31 | } 32 | -------------------------------------------------------------------------------- /river/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | """Unsupervised clustering.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .clustream import CluStream 6 | from .dbstream import DBSTREAM 7 | from .denstream 
import DenStream 8 | from .k_means import KMeans 9 | from .odac import ODAC 10 | from .streamkmeans import STREAMKMeans 11 | from .textclust import TextClust 12 | 13 | __all__ = ["CluStream", "DBSTREAM", "DenStream", "KMeans", "ODAC", "STREAMKMeans", "TextClust"] 14 | -------------------------------------------------------------------------------- /river/compat/__init__.py: -------------------------------------------------------------------------------- 1 | """Compatibility tools. 2 | 3 | This module contains adapters for making River estimators compatible with other libraries, and 4 | vice-versa whenever possible. The relevant adapters will only be usable if you have installed the 5 | necessary library. For instance, you have to install scikit-learn in order to use the 6 | `compat.convert_sklearn_to_river` function. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | __all__: list[str] = [] 13 | 14 | try: 15 | from .river_to_sklearn import ( 16 | River2SKLClassifier, 17 | River2SKLClusterer, 18 | River2SKLRegressor, 19 | River2SKLTransformer, 20 | convert_river_to_sklearn, 21 | ) 22 | from .sklearn_to_river import SKL2RiverClassifier, SKL2RiverRegressor, convert_sklearn_to_river 23 | 24 | __all__ += [ 25 | "convert_river_to_sklearn", 26 | "convert_sklearn_to_river", 27 | "River2SKLRegressor", 28 | "River2SKLClassifier", 29 | "River2SKLClusterer", 30 | "River2SKLTransformer", 31 | "SKL2RiverClassifier", 32 | "SKL2RiverRegressor", 33 | ] 34 | except ModuleNotFoundError: 35 | pass 36 | -------------------------------------------------------------------------------- /river/compose/__init__.py: -------------------------------------------------------------------------------- 1 | """Model composition. 2 | 3 | This module contains utilities for merging multiple modeling steps into a single pipeline. Although 4 | pipelines are not the only way to process a stream of data, we highly encourage you to use them. 5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from .func import FuncTransformer 11 | from .grouper import Grouper 12 | from .pipeline import Pipeline, learn_during_predict 13 | from .product import TransformerProduct 14 | from .renamer import Prefixer, Renamer, Suffixer 15 | from .select import Discard, Select, SelectType 16 | from .target_transform import TargetTransformRegressor 17 | from .union import TransformerUnion 18 | 19 | __all__ = [ 20 | "Discard", 21 | "FuncTransformer", 22 | "Grouper", 23 | "Pipeline", 24 | "Prefixer", 25 | "pure_inference_mode", 26 | "Renamer", 27 | "Select", 28 | "SelectType", 29 | "Suffixer", 30 | "TargetTransformRegressor", 31 | "TransformerProduct", 32 | "TransformerUnion", 33 | "learn_during_predict", 34 | ] 35 | -------------------------------------------------------------------------------- /river/compose/grouper.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import collections 4 | import copy 5 | import functools 6 | 7 | from river import base 8 | 9 | __all__ = ["Grouper"] 10 | 11 | 12 | class Grouper(base.Transformer): 13 | """Applies a transformer within different groups. 14 | 15 | This transformer allows you to split your data into groups and apply a transformer within each 16 | group. This happens in a streaming manner, which means that the groups are discovered online. 17 | A separate copy of the provided transformer is made whenever a new group appears. The groups 18 | are defined according to one or more keys. 
19 | 20 | Parameters 21 | ---------- 22 | transformer 23 | by 24 | The field on which to group the data. This can either by a single value, or a list of 25 | values. 26 | 27 | """ 28 | 29 | def __init__( 30 | self, 31 | transformer: base.BaseTransformer, 32 | by: base.typing.FeatureName | list[base.typing.FeatureName], 33 | ): 34 | self.transformer = transformer 35 | self.by = by if isinstance(by, list) else [by] 36 | self.transformers: collections.defaultdict = collections.defaultdict( 37 | functools.partial(copy.deepcopy, transformer) 38 | ) 39 | 40 | def _get_key(self, x): 41 | return "_".join(str(x[k]) for k in self.by) 42 | 43 | def learn_one(self, x): 44 | key = self._get_key(x) 45 | self.transformers[key].learn_one(x) 46 | 47 | def transform_one(self, x): 48 | key = self._get_key(x) 49 | return self.transformers[key].transform_one(x) 50 | -------------------------------------------------------------------------------- /river/conf/__init__.py: -------------------------------------------------------------------------------- 1 | """Conformal predictions. This modules contains wrappers to enable conformal predictions on any 2 | regressor or classifier.""" 3 | 4 | from __future__ import annotations 5 | 6 | from .interval import Interval 7 | from .jackknife import RegressionJackknife 8 | 9 | __all__ = [ 10 | "Interval", 11 | "RegressionJackknife", 12 | ] 13 | -------------------------------------------------------------------------------- /river/conf/interval.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import dataclasses 4 | 5 | 6 | @dataclasses.dataclass 7 | class Interval: 8 | """An object to represent a (prediction) interval. 9 | 10 | Users are not expected to use this class as-is. Instead, they should use the `with_interval` 11 | parameter of the `predict_one` method of any regressor or classifier wrapped with a conformal 12 | prediction method. 13 | 14 | Parameters 15 | ---------- 16 | lower 17 | The lower bound of the interval. 18 | upper 19 | The upper bound of the interval. 
20 | 21 | """ 22 | 23 | lower: float 24 | upper: float 25 | 26 | @property 27 | def center(self): 28 | """The center of the interval.""" 29 | return (self.lower + self.upper) / 2 30 | 31 | @property 32 | def width(self): 33 | """The width of the interval.""" 34 | return self.upper - self.lower 35 | 36 | def __contains__(self, x): 37 | return self.lower <= x <= self.upper 38 | -------------------------------------------------------------------------------- /river/conftest.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | collect_ignore = [] 4 | 5 | try: 6 | import sklearn # noqa: F401 7 | except ImportError: 8 | collect_ignore.append("compat/test_sklearn.py") 9 | 10 | try: 11 | import sqlalchemy # noqa: F401 12 | except ImportError: 13 | collect_ignore.append("stream/iter_sql.py") 14 | collect_ignore.append("stream/test_sql.py") 15 | 16 | try: 17 | import vaex # noqa: F401 18 | except ImportError: 19 | collect_ignore.append("stream/iter_vaex.py") 20 | -------------------------------------------------------------------------------- /river/covariance/__init__.py: -------------------------------------------------------------------------------- 1 | """Online estimation of covariance and precision matrices.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .emp import EmpiricalCovariance, EmpiricalPrecision 6 | 7 | __all__ = ["EmpiricalCovariance", "EmpiricalPrecision"] 8 | -------------------------------------------------------------------------------- /river/datasets/airline_passengers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class AirlinePassengers(base.FileDataset): 9 | """Monthly number of international airline passengers. 10 | 11 | The stream contains 144 items and only one single feature, which is the month. The goal is to 12 | predict the number of passengers each month by capturing the trend and the seasonality of the 13 | data. 14 | 15 | References 16 | ---------- 17 | [^1]: [International airline passengers: monthly totals in thousands. Jan 49 – Dec 60](https://rdrr.io/r/datasets/AirPassengers.html) 18 | 19 | """ 20 | 21 | def __init__(self): 22 | super().__init__( 23 | filename="airline-passengers.csv", 24 | task=base.REG, 25 | n_features=1, 26 | n_samples=144, 27 | ) 28 | 29 | def __iter__(self): 30 | return stream.iter_csv( 31 | self.path, 32 | target="passengers", 33 | converters={"passengers": int}, 34 | parse_dates={"month": "%Y-%m"}, 35 | ) 36 | -------------------------------------------------------------------------------- /river/datasets/banana.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/datasets/banana.zip -------------------------------------------------------------------------------- /river/datasets/bananas.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class Bananas(base.FileDataset): 9 | """Bananas dataset. 10 | 11 | An artificial dataset where instances belongs to several clusters with a banana shape. 12 | There are two attributes that correspond to the x and y axis, respectively. 
13 | 14 | References 15 | ---------- 16 | [^1]: [OpenML page](https://www.openml.org/d/1460) 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__(filename="banana.zip", n_samples=5300, n_features=2, task=base.BINARY_CLF) 22 | 23 | def __iter__(self): 24 | return stream.iter_libsvm(self.path, target_type=lambda x: x == "1") 25 | -------------------------------------------------------------------------------- /river/datasets/bikes.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class Bikes(base.RemoteDataset): 9 | """Bike sharing station information from the city of Toulouse. 10 | 11 | The goal is to predict the number of bikes in 5 different bike stations from the city of 12 | Toulouse. 13 | 14 | References 15 | ---------- 16 | [^1]: [A short introduction and conclusion to the OpenBikes 2016 Challenge](https://maxhalford.github.io/blog/openbikes-challenge/) 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | url="https://maxhalford.github.io/files/datasets/toulouse_bikes.zip", 23 | size=13_125_015, 24 | n_samples=182_470, 25 | n_features=8, 26 | task=base.REG, 27 | filename="toulouse_bikes.csv", 28 | ) 29 | 30 | def _iter(self): 31 | return stream.iter_csv( 32 | self.path, 33 | target="bikes", 34 | converters={ 35 | "clouds": int, 36 | "humidity": int, 37 | "pressure": float, 38 | "temperature": float, 39 | "wind": float, 40 | "bikes": int, 41 | }, 42 | parse_dates={"moment": "%Y-%m-%d %H:%M:%S"}, 43 | ) 44 | -------------------------------------------------------------------------------- /river/datasets/chick_weights.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class ChickWeights(base.FileDataset): 9 | """Chick weights along time. 10 | 11 | The stream contains 578 items and 3 features. The goal is to predict the weight of each chick 12 | along time, according to the diet the chick is on. The data is ordered by time and then by 13 | chick. 14 | 15 | References 16 | ---------- 17 | [^1]: [Chick weight dataset overview](http://rstudio-pubs-static.s3.amazonaws.com/107631_131ad1c022df4f90aa2d214a5c5609b2.html) 18 | 19 | """ 20 | 21 | def __init__(self): 22 | super().__init__(filename="chick-weights.csv", n_samples=578, n_features=3, task=base.REG) 23 | 24 | def __iter__(self): 25 | return stream.iter_csv( 26 | self.path, 27 | target="weight", 28 | converters={"time": int, "weight": int, "chick": int, "diet": int}, 29 | ) 30 | -------------------------------------------------------------------------------- /river/datasets/http.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class HTTP(base.RemoteDataset): 9 | """HTTP dataset of the KDD 1999 cup. 10 | 11 | The goal is to predict whether or not an HTTP connection is anomalous or not. The dataset only 12 | contains 2,211 (0.4%) positive labels. 
13 | 14 | References 15 | ---------- 16 | [^1]: [HTTP (KDDCUP99) dataset](http://odds.cs.stonybrook.edu/http-kddcup99-dataset/) 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | n_samples=567_498, 23 | n_features=3, 24 | task=base.BINARY_CLF, 25 | url="https://maxhalford.github.io/files/datasets/kdd99_http.zip", 26 | size=32_400_738, 27 | filename="kdd99_http.csv", 28 | ) 29 | 30 | def _iter(self): 31 | converters = { 32 | "duration": float, 33 | "src_bytes": float, 34 | "dst_bytes": float, 35 | "service": int, 36 | } 37 | return stream.iter_csv(self.path, target="service", converters=converters) 38 | -------------------------------------------------------------------------------- /river/datasets/malicious_url.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import os 4 | 5 | from . import base 6 | 7 | 8 | class MaliciousURL(base.RemoteDataset): 9 | """Malicious URLs dataset. 10 | 11 | This dataset contains features about URLs that are classified as malicious or not. 12 | 13 | References 14 | ---------- 15 | [^1]: [Detecting Malicious URLs](http://www.sysnet.ucsd.edu/projects/url/) 16 | [^2]: [Identifying Suspicious URLs: An Application of Large-Scale Online Learning](http://cseweb.ucsd.edu/~jtma/papers/url-icml2009.pdf) 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | n_samples=2_396_130, 23 | n_features=3_231_961, 24 | task=base.BINARY_CLF, 25 | url="http://www.sysnet.ucsd.edu/projects/url/url_svmlight.tar.gz", 26 | filename="url_svmlight", 27 | size=2_210_273_352, 28 | sparse=True, 29 | ) 30 | 31 | def _iter(self): 32 | files = list(self.path.glob("Day*.svm")) 33 | files.sort(key=lambda x: int(os.path.basename(x).split(".")[0][3:])) 34 | 35 | def parse_libsvm_feature(f): 36 | k, v = f.split(":") 37 | return int(k), float(v) 38 | 39 | # There are 150 files with each one corresponding to a day 40 | for file in files: 41 | with open(file) as f: 42 | for line in f: 43 | # Each file has the libsvm format 44 | elements = line.rstrip().split(" ") 45 | y = elements.pop(0) == "+1" 46 | x = dict(parse_libsvm_feature(f) for f in elements) 47 | yield x, y 48 | -------------------------------------------------------------------------------- /river/datasets/phishing.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/datasets/phishing.csv.gz -------------------------------------------------------------------------------- /river/datasets/phishing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class Phishing(base.FileDataset): 9 | """Phishing websites. 10 | 11 | This dataset contains features from web pages that are classified as phishing or not. 
12 | 13 | References 14 | ---------- 15 | [^1]: [UCI page](http://archive.ics.uci.edu/ml/datasets/Website+Phishing) 16 | 17 | """ 18 | 19 | def __init__(self) -> None: 20 | super().__init__( 21 | n_samples=1_250, 22 | n_features=9, 23 | task=base.BINARY_CLF, 24 | filename="phishing.csv.gz", 25 | ) 26 | 27 | def __iter__(self): 28 | return stream.iter_csv( 29 | self.path, 30 | target="is_phishing", 31 | converters={ 32 | "empty_server_form_handler": float, 33 | "popup_window": float, 34 | "https": float, 35 | "request_from_other_domain": float, 36 | "anchor_from_other_domain": float, 37 | "is_popular": float, 38 | "long_url": float, 39 | "age_of_domain": int, 40 | "ip_in_url": int, 41 | "is_phishing": lambda x: x == "1", 42 | }, 43 | ) 44 | -------------------------------------------------------------------------------- /river/datasets/restaurants.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import ast 4 | 5 | from river import stream 6 | 7 | from . import base 8 | 9 | 10 | class Restaurants(base.RemoteDataset): 11 | """Data from the Kaggle Recruit Restaurants challenge. 12 | 13 | The goal is to predict the number of visitors in each of 829 Japanese restaurants over a period 14 | of roughly 16 weeks. The data is ordered by date and then by restaurant ID. 15 | 16 | References 17 | ---------- 18 | [^1]: [Recruit Restaurant Visitor Forecasting](https://www.kaggle.com/c/recruit-restaurant-visitor-forecasting) 19 | 20 | """ 21 | 22 | def __init__(self): 23 | super().__init__( 24 | n_samples=252_108, 25 | n_features=7, 26 | task=base.REG, 27 | url="https://maxhalford.github.io/files/datasets/kaggle_recruit_restaurants.zip", 28 | size=28_881_242, 29 | filename="kaggle_recruit_restaurants.csv", 30 | ) 31 | 32 | def _iter(self): 33 | return stream.iter_csv( 34 | self.path, 35 | target="visitors", 36 | converters={ 37 | "latitude": float, 38 | "longitude": float, 39 | "visitors": int, 40 | "is_holiday": ast.literal_eval, 41 | }, 42 | parse_dates={"date": "%Y-%m-%d"}, 43 | ) 44 | -------------------------------------------------------------------------------- /river/datasets/segment.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/datasets/segment.csv.zip -------------------------------------------------------------------------------- /river/datasets/segment.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class ImageSegments(base.FileDataset): 9 | """Image segments classification. 10 | 11 | This dataset contains features that describe image segments into 7 classes: brickface, sky, 12 | foliage, cement, window, path, and grass. 
13 | 14 | References 15 | ---------- 16 | [^1]: [UCI page](https://archive.ics.uci.edu/ml/datasets/Statlog+(Image+Segmentation)) 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | n_samples=2_310, 23 | n_classes=7, 24 | n_features=18, 25 | task=base.MULTI_CLF, 26 | filename="segment.csv.zip", 27 | ) 28 | 29 | def __iter__(self): 30 | return stream.iter_csv( 31 | self.path, 32 | target="category", 33 | converters={ 34 | "region-centroid-col": int, 35 | "region-centroid-row": int, 36 | "short-line-density-5": float, 37 | "short-line-density-2": float, 38 | "vedge-mean": float, 39 | "vegde-sd": float, 40 | "hedge-mean": float, 41 | "hedge-sd": float, 42 | "intensity-mean": float, 43 | "rawred-mean": float, 44 | "rawblue-mean": float, 45 | "rawgreen-mean": float, 46 | "exred-mean": float, 47 | "exblue-mean": float, 48 | "exgreen-mean": float, 49 | "value-mean": float, 50 | "saturation-mean": float, 51 | "hue-mean": float, 52 | }, 53 | ) 54 | -------------------------------------------------------------------------------- /river/datasets/sms_spam.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from . import base 4 | 5 | 6 | class SMSSpam(base.RemoteDataset): 7 | """SMS Spam Collection dataset. 8 | 9 | The data contains 5,574 items and 1 feature (i.e. SMS body). Spam messages represent 10 | 13.4% of the dataset. The goal is to predict whether an SMS is a spam or not. 11 | 12 | References 13 | ---------- 14 | [^1]: [Almeida, T.A., Hidalgo, J.M.G. and Yamakami, A., 2011, September. Contributions to the study of SMS spam filtering: new collection and results. In Proceedings of the 11th ACM symposium on Document engineering (pp. 259-262).](http://www.dt.fee.unicamp.br/~tiago/smsspamcollection/doceng11.pdf) 15 | 16 | """ 17 | 18 | def __init__(self): 19 | super().__init__( 20 | n_samples=5_574, 21 | n_features=1, 22 | task=base.BINARY_CLF, 23 | url="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip", 24 | size=477_907, 25 | filename="SMSSpamCollection", 26 | ) 27 | 28 | def _iter(self): 29 | with open(self.path) as f: 30 | for row in f: 31 | label, body = row.split("\t") 32 | yield ({"body": body}, label == "spam") 33 | -------------------------------------------------------------------------------- /river/datasets/smtp.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class SMTP(base.RemoteDataset): 9 | """SMTP dataset from the KDD 1999 cup. 10 | 11 | The goal is to predict whether or not an SMTP connection is anomalous or not. The dataset only 12 | contains 2,211 (0.4%) positive labels. 
13 | 14 | References 15 | ---------- 16 | [^1]: [SMTP (KDDCUP99) dataset](http://odds.cs.stonybrook.edu/smtp-kddcup99-dataset/) 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | n_samples=95_156, 23 | n_features=3, 24 | task=base.BINARY_CLF, 25 | url="https://maxhalford.github.io/files/datasets/smtp.zip", 26 | size=5_484_982, 27 | filename="smtp.csv", 28 | ) 29 | 30 | def _iter(self): 31 | return stream.iter_csv( 32 | self.path, 33 | target="service", 34 | converters={ 35 | "duration": float, 36 | "src_bytes": float, 37 | "dst_bytes": float, 38 | "service": int, 39 | }, 40 | ) 41 | -------------------------------------------------------------------------------- /river/datasets/solar-flare.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/datasets/solar-flare.csv.zip -------------------------------------------------------------------------------- /river/datasets/solar_flare.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class SolarFlare(base.FileDataset): 9 | """Solar flare multi-output regression. 10 | 11 | References 12 | ---------- 13 | [^1]: [UCI page](https://archive.ics.uci.edu/ml/datasets/Solar+Flare) 14 | 15 | """ 16 | 17 | def __init__(self): 18 | super().__init__( 19 | n_samples=1_066, 20 | n_features=10, 21 | n_outputs=3, 22 | task=base.MO_REG, 23 | filename="solar-flare.csv.zip", 24 | ) 25 | 26 | def __iter__(self): 27 | return stream.iter_csv( 28 | self.path, 29 | target=["c-class-flares", "m-class-flares", "x-class-flares"], 30 | converters={ 31 | "zurich-class": str, 32 | "largest-spot-size": str, 33 | "spot-distribution": str, 34 | "activity": int, 35 | "evolution": int, 36 | "previous-24h-flare-activity": int, 37 | "hist-complex": int, 38 | "hist-complex-this-pass": int, 39 | "area": int, 40 | "largest-spot-area": int, 41 | "c-class-flares": int, 42 | "m-class-flares": int, 43 | "x-class-flares": int, 44 | }, 45 | ) 46 | -------------------------------------------------------------------------------- /river/datasets/taxis.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class Taxis(base.RemoteDataset): 9 | """Taxi ride durations in New York City. 10 | 11 | The goal is to predict the duration of taxi rides in New York City. 
12 | 13 | References 14 | ---------- 15 | [^1]: [New York City Taxi Trip Duration competition on Kaggle](https://www.kaggle.com/c/nyc-taxi-trip-duration) 16 | 17 | """ 18 | 19 | def __init__(self): 20 | super().__init__( 21 | n_samples=1_458_644, 22 | n_features=8, 23 | task=base.REG, 24 | url="https://maxhalford.github.io/files/datasets/nyc_taxis.zip", 25 | size=195_271_696, 26 | filename="train.csv", 27 | ) 28 | 29 | def _iter(self): 30 | return stream.iter_csv( 31 | self.path, 32 | target="trip_duration", 33 | converters={ 34 | "passenger_count": int, 35 | "pickup_longitude": float, 36 | "pickup_latitude": float, 37 | "dropoff_longitude": float, 38 | "dropoff_latitude": float, 39 | "trip_duration": int, 40 | }, 41 | parse_dates={"pickup_datetime": "%Y-%m-%d %H:%M:%S"}, 42 | drop=["dropoff_datetime", "id"], 43 | ) 44 | -------------------------------------------------------------------------------- /river/datasets/trec07.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class TREC07(base.RemoteDataset): 9 | """TREC's 2007 Spam Track dataset. 10 | 11 | The data contains 75,419 chronologically ordered items, i.e. 3 months of emails delivered 12 | to a particular server in 2007. Spam messages represent 66.6% of the dataset. 13 | The goal is to predict whether an email is a spam or not. 14 | 15 | The available raw features are: sender, recipients, date, subject, body. 16 | 17 | References 18 | ---------- 19 | [^1]: [TREC 2007 Spam Track Overview](https://trec.nist.gov/pubs/trec16/papers/SPAM.OVERVIEW16.pdf) 20 | [^2]: [Code ran to parse the dataset](https://gist.github.com/gbolmier/b6a942699aaaedec54041a32e4f34d40) 21 | 22 | """ 23 | 24 | def __init__(self): 25 | super().__init__( 26 | n_samples=75_419, 27 | n_features=5, 28 | task=base.BINARY_CLF, 29 | url="https://maxhalford.github.io/files/datasets/trec07p.zip", 30 | size=144_504_829, 31 | filename="trec07p.csv", 32 | ) 33 | 34 | def _iter(self): 35 | return stream.iter_csv( 36 | self.path, 37 | target="y", 38 | delimiter=",", 39 | quotechar='"', 40 | field_size_limit=1_000_000, 41 | ) 42 | -------------------------------------------------------------------------------- /river/datasets/trump_approval.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/datasets/trump_approval.csv.gz -------------------------------------------------------------------------------- /river/datasets/trump_approval.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class TrumpApproval(base.FileDataset): 9 | """Donald Trump approval ratings. 10 | 11 | This dataset was obtained by reshaping the data used by FiveThirtyEight for analyzing Donald 12 | Trump's approval ratings. It contains 5 features, which are approval ratings collected by 13 | 5 polling agencies. The target is the approval rating from FiveThirtyEight's model. The goal of 14 | this task is to see if we can reproduce FiveThirtyEight's model. 
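A natural first attempt at this task is a scaled linear regression evaluated progressively. The model and metric below are illustrative choices, not part of the dataset definition:

>>> from river import datasets, evaluate, linear_model, metrics, preprocessing

>>> model = preprocessing.StandardScaler() | linear_model.LinearRegression()
>>> score = evaluate.progressive_val_score(datasets.TrumpApproval(), model, metrics.MAE())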
15 | 16 | References 17 | ---------- 18 | [^1]: [Trump Approval Ratings](https://projects.fivethirtyeight.com/trump-approval-ratings/) 19 | 20 | """ 21 | 22 | def __init__(self): 23 | super().__init__( 24 | n_samples=1_001, 25 | n_features=6, 26 | task=base.REG, 27 | filename="trump_approval.csv.gz", 28 | ) 29 | 30 | def __iter__(self): 31 | return stream.iter_csv( 32 | self.path, 33 | target="five_thirty_eight", 34 | converters={ 35 | "ordinal_date": int, 36 | "gallup": float, 37 | "ipsos": float, 38 | "morning_consult": float, 39 | "rasmussen": float, 40 | "you_gov": float, 41 | "five_thirty_eight": float, 42 | }, 43 | ) 44 | -------------------------------------------------------------------------------- /river/datasets/water_flow.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stream 4 | 5 | from . import base 6 | 7 | 8 | class WaterFlow(base.FileDataset): 9 | """Water flow through a pipeline branch. 10 | 11 | The series includes hourly values for about 2 months, March 2022 to May 2022. The values are 12 | expressed in liters per second. There are four anomalous segments in the series: 13 | 14 | * 3 "low value moments": this is due to water losses or human intervention for maintenance 15 | * A small peak in the water inflow after the first 2 segments: this is due to a pumping 16 | operation into the main pipeline, when more water pressure is needed 17 | 18 | This dataset is well suited for time series forecasting models, as well as anomaly detection 19 | methods. Ideally, the goal is to build a time series forecasting model that is robust to the 20 | anomalous segments. 21 | 22 | This data has been kindly donated by the Tecnojest s.r.l. company (www.invidea.it) from Italy. 23 | 24 | """ 25 | 26 | def __init__(self): 27 | super().__init__( 28 | filename="water-flow.csv", 29 | task=base.REG, 30 | n_features=1, 31 | n_samples=1_268, 32 | ) 33 | 34 | def __iter__(self): 35 | return stream.iter_csv( 36 | self.path, 37 | target="Water flow [l/s]", 38 | converters={"Water flow [l/s]": float}, 39 | parse_dates={"Time": "%Y-%m-%dT%H:%M:%S%z"}, 40 | ) 41 | -------------------------------------------------------------------------------- /river/drift/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Concept Drift Detection. 3 | 4 | This module contains concept drift detection methods. The purpose of a drift detector is to raise 5 | an alarm if the data distribution changes. A good drift detector method is the one that maximizes 6 | the true positives while keeping the number of false positives to a minimum. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from . 
import binary, datasets 13 | from .adwin import ADWIN 14 | from .dummy import DummyDriftDetector 15 | from .kswin import KSWIN 16 | from .no_drift import NoDrift 17 | from .page_hinkley import PageHinkley 18 | from .retrain import DriftRetrainingClassifier 19 | 20 | __all__ = [ 21 | "binary", 22 | "datasets", 23 | "ADWIN", 24 | "DriftRetrainingClassifier", 25 | "DummyDriftDetector", 26 | "KSWIN", 27 | "NoDrift", 28 | "PageHinkley", 29 | ] 30 | -------------------------------------------------------------------------------- /river/drift/adwin_c.pyi: -------------------------------------------------------------------------------- 1 | class AdaptiveWindowing: 2 | def __init__( 3 | self, 4 | delta: float = 0.002, 5 | clock: int = 32, 6 | max_buckets: int = 5, 7 | min_window_length: int = 5, 8 | grace_period: int = 10, 9 | ) -> None: ... 10 | def get_n_detections(self) -> int: ... 11 | def get_width(self) -> float: ... 12 | def get_total(self) -> float: ... 13 | def get_variance(self) -> float: ... 14 | @property 15 | def variance_in_window(self) -> float: ... 16 | def update(self, value: float) -> bool: ... 17 | 18 | class Bucket: 19 | def __init__(self, max_size: int) -> None: ... 20 | def clear_at(self, index: int) -> None: ... 21 | def insert_data(self, value: float, variance: float) -> None: ... 22 | def remove(self) -> None: ... 23 | def compress(self, n_elements: int) -> None: ... 24 | def get_total_at(self, index: int) -> float: ... 25 | def get_variance_at(self, index: int) -> float: ... 26 | def set_total_at(self, value: float, index: int) -> None: ... 27 | def set_variance_at(self, value: float, index: int) -> None: ... 28 | -------------------------------------------------------------------------------- /river/drift/binary/__init__.py: -------------------------------------------------------------------------------- 1 | """Drift detection for binary data.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .ddm import DDM 6 | from .eddm import EDDM 7 | from .fhddm import FHDDM 8 | from .hddm_a import HDDM_A 9 | from .hddm_w import HDDM_W 10 | 11 | __all__ = ["DDM", "EDDM", "FHDDM", "HDDM_A", "HDDM_W"] 12 | -------------------------------------------------------------------------------- /river/drift/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .airline_passengers import AirlinePassengers 4 | from .apple import Apple 5 | from .bitcoin import Bitcoin 6 | from .brent_crude_oil import BrentSpotPrice 7 | from .occupancy import Occupancy 8 | from .run_log import RunLog 9 | from .uk_coal_employment import UKCoalEmploy 10 | 11 | __all__ = [ 12 | "Bitcoin", 13 | "BrentSpotPrice", 14 | "UKCoalEmploy", 15 | "AirlinePassengers", 16 | "RunLog", 17 | "Occupancy", 18 | "Apple", 19 | ] 20 | -------------------------------------------------------------------------------- /river/drift/datasets/airline_passengers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, stream 4 | 5 | from .base import ChangePointFileDataset 6 | 7 | 8 | class AirlinePassengers(ChangePointFileDataset): 9 | """JFK Airline Passengers 10 | 11 | This dataset gives the number of passengers arriving and departing at JFK. 12 | The data is obtained from New York State's official Kaggle page for this dataset.
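These change point datasets pair naturally with the detectors exposed by the drift module introduced above. A minimal sketch, where the raw target values are simply fed to ADWIN with its default parameters:

>>> from river import drift

>>> detector = drift.ADWIN()
>>> change_points = []
>>> for i, (x, y) in enumerate(drift.datasets.AirlinePassengers()):
...     detector.update(y)
...     if detector.drift_detected:
...         change_points.append(i)  # indices where a distribution change was flagged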
13 | 14 | References 15 | ---------- 16 | [^1]: https://www.kaggle.com/new-york-state/nys-air-passenger-traffic,-port-authority-of-ny-nj#air-passenger-traffic-per-month-port-authority-of-ny-nj-beginning-1977.csv 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | annotations={"6": [299], "7": [], "8": [302], "9": [326, 382], "10": [296]}, 23 | filename="airline_passengers.csv", 24 | task=datasets.base.REG, 25 | n_samples=468, 26 | n_features=1, 27 | ) 28 | 29 | def __iter__(self): 30 | return stream.iter_csv( 31 | self.path, 32 | target="Total Passengers", 33 | converters={ 34 | "Total Passengers": int, 35 | }, 36 | parse_dates={"date": "%Y-%b"}, 37 | ) 38 | -------------------------------------------------------------------------------- /river/drift/datasets/apple.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, stream 4 | 5 | from .base import ChangePointFileDataset 6 | 7 | 8 | class Apple(ChangePointFileDataset): 9 | """Apple Stock 10 | 11 | This dataset concerns the daily close price and volume of Apple stock around the year 2000. The dataset is sampled every 3 observations to reduce the length of the time series. 12 | This dataset is retrieved from Yahoo Finance. 13 | 14 | References 15 | ---------- 16 | [^1]: https://finance.yahoo.com/quote/AAPL/history?period1=850348800&period2=1084579200&interval=1d&filter=history&frequency=1d 17 | 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | annotations={ 23 | "6": [319], 24 | "7": [319], 25 | "8": [319], 26 | "9": [53, 90, 197, 276, 319, 403, 463, 535], 27 | "10": [319], 28 | }, 29 | filename="apple.csv", 30 | task=datasets.base.REG, 31 | n_samples=1867, 32 | n_features=6, 33 | ) 34 | 35 | def __iter__(self): 36 | return stream.iter_csv( 37 | self.path, 38 | target=["Open", "High", "Low", "Close", "Adj Close", "Volume"], 39 | converters={ 40 | "Open": float, 41 | "High": float, 42 | "Low": float, 43 | "Close": float, 44 | "Adj Close": float, 45 | "Volume": float, 46 | }, 47 | parse_dates={"Date": "%Y-%m-%d"}, 48 | ) 49 | -------------------------------------------------------------------------------- /river/drift/datasets/bitcoin.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, stream 4 | 5 | from .base import ChangePointFileDataset 6 | 7 | 8 | class Bitcoin(ChangePointFileDataset): 9 | """Bitcoin Market Price 10 | 11 | This is a regression task, where the goal is to predict the average USD market price across 12 | major bitcoin exchanges. This data was collected from the official Blockchain website. There 13 | is only one feature given, the day of exchange, which is in increments of three. The first 14 | 500 lines have been removed because they are not interesting. 
15 | 16 | References 17 | ---------- 18 | [^1]: https://www.blockchain.com/fr/explorer/charts/market-price?timespan=all 19 | 20 | """ 21 | 22 | def __init__(self): 23 | super().__init__( 24 | annotations={ 25 | "6": [502, 580, 702, 747], 26 | "8": [583], 27 | "12": [597], 28 | "13": [522, 579, 591, 629, 703, 747, 760], 29 | "14": [93, 522, 540, 701, 747, 760, 772], 30 | }, 31 | filename="bitcoin.csv", 32 | task=datasets.base.REG, 33 | n_samples=822, 34 | n_features=1, 35 | ) 36 | 37 | def __iter__(self): 38 | return stream.iter_csv( 39 | self.path, 40 | target="price", 41 | converters={ 42 | "price": float, 43 | }, 44 | parse_dates={"date": "%Y-%m-%d"}, 45 | ) 46 | -------------------------------------------------------------------------------- /river/drift/datasets/occupancy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, stream 4 | 5 | from .base import ChangePointFileDataset 6 | 7 | 8 | class Occupancy(ChangePointFileDataset): 9 | """Room occupancy data. 10 | 11 | Dataset on detecting room occupancy based on several variables. The dataset contains 12 | temperature, humidity, light, and CO2 variables. 13 | 14 | The data is sampled at every 16 observations to reduce the length of the series. 15 | 16 | References 17 | ---------- 18 | Candanedo, Luis M., and Véronique Feldheim. "Accurate occupancy detection of an office room from light, temperature, humidity and CO2 measurements using statistical learning models." Energy and Buildings 112 (2016): 28-39. 19 | 20 | """ 21 | 22 | def __init__(self): 23 | super().__init__( 24 | annotations={ 25 | "6": [238, 416], 26 | "8": [53, 143, 238, 417], 27 | "9": [53, 92, 142, 181, 236, 264, 341, 416, 436, 451, 506], 28 | "10": [1, 52, 91, 142, 181, 234, 267, 324, 360, 416, 451, 506], 29 | "12": [234, 415], 30 | }, 31 | filename="occupancy.csv", 32 | task=datasets.base.REG, 33 | n_samples=509, 34 | n_features=4, 35 | ) 36 | 37 | def __iter__(self): 38 | return stream.iter_csv( 39 | self.path, 40 | target=["V1", "V2", "V3", "V4"], 41 | converters={ 42 | "V1": float, 43 | "V2": float, 44 | "V3": float, 45 | "V4": float, 46 | }, 47 | parse_dates={"time": "%Y-%m-%d %H:%M:%S"}, 48 | ) 49 | -------------------------------------------------------------------------------- /river/drift/datasets/run_log.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, stream 4 | 5 | from .base import ChangePointFileDataset 6 | 7 | 8 | class RunLog(ChangePointFileDataset): 9 | """Interval Training Running Pace. 10 | 11 | This dataset shows the pace of a runner during an interval training session, where a mobile 12 | application provides instructions on when to run and when to walk. 
13 | 14 | """ 15 | 16 | def __init__(self): 17 | super().__init__( 18 | annotations={ 19 | "6": [60, 96, 114, 174, 204, 240, 258, 317], 20 | "7": [60, 96, 114, 177, 204, 240, 258, 317], 21 | "8": [60, 96, 114, 174, 204, 240, 258, 317], 22 | "10": [2, 60, 96, 114, 174, 204, 240, 258, 317], 23 | "12": [], 24 | }, 25 | filename="run_log.csv", 26 | task=datasets.base.REG, 27 | n_samples=376, 28 | n_features=2, 29 | ) 30 | 31 | def __iter__(self): 32 | return stream.iter_csv( 33 | self.path, 34 | target=["Pace", "Distance"], 35 | converters={"Pace": float, "Distance": float}, 36 | parse_dates={"time": "%Y-%m-%d %H:%M:%S"}, 37 | ) 38 | -------------------------------------------------------------------------------- /river/drift/datasets/uk_coal_employment.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, stream 4 | 5 | from .base import ChangePointFileDataset 6 | 7 | 8 | class UKCoalEmploy(ChangePointFileDataset): 9 | """Historic Employment in UK Coal Mines 10 | 11 | This is historic data obtained from the UK government. 12 | We use the employment column, which gives the number of workers employed in the British coal mines. 13 | Missing values in the data are replaced with the value of the preceding year. 14 | 15 | References 16 | ---------- 17 | [^1]: https://www.gov.uk/government/statistical-data-sets/historical-coal-data-coal-production-availability-and-consumption 18 | """ 19 | 20 | def __init__(self): 21 | super().__init__( 22 | annotations={ 23 | "6": [15, 28, 45, 60, 68, 80], 24 | "7": [18, 47, 81], 25 | "8": [], 26 | "9": [15, 27, 46, 68, 81], 27 | "13": [19, 28, 45, 68, 80], 28 | }, 29 | filename="uk_coal_employment.csv", 30 | task=datasets.base.REG, 31 | n_samples=105, 32 | n_features=1, 33 | ) 34 | 35 | def __iter__(self): 36 | return stream.iter_csv( 37 | self.path, 38 | target="Employment", 39 | converters={ 40 | "Employment": int, 41 | }, 42 | parse_dates={"Year": "%Y"}, 43 | ) 44 | -------------------------------------------------------------------------------- /river/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """Ensemble learning. 2 | 3 | Broadly speaking, there are two kinds of ensemble approaches. There are those that copy a single 4 | model several times and aggregate the predictions of said copies. This includes bagging as well as 5 | boosting. Then there are those that are composed of an arbitrary list of models, and can therefore 6 | aggregate predictions from different kinds of models.
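To make the two flavours concrete, the sketch below builds one ensemble of each kind; the base models and ensemble size are purely illustrative:

>>> from river import ensemble, linear_model, naive_bayes, tree

>>> homogeneous = ensemble.BaggingClassifier(model=linear_model.LogisticRegression(), n_models=5)
>>> heterogeneous = ensemble.VotingClassifier([
...     linear_model.LogisticRegression(),
...     naive_bayes.GaussianNB(),
...     tree.HoeffdingTreeClassifier(),
... ])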
7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from .bagging import ( 13 | ADWINBaggingClassifier, 14 | BaggingClassifier, 15 | BaggingRegressor, 16 | LeveragingBaggingClassifier, 17 | ) 18 | from .boosting import AdaBoostClassifier, ADWINBoostingClassifier, BOLEClassifier 19 | from .ewa import EWARegressor 20 | from .stacking import StackingClassifier 21 | from .streaming_random_patches import SRPClassifier, SRPRegressor 22 | from .voting import VotingClassifier 23 | 24 | __all__ = [ 25 | "AdaBoostClassifier", 26 | "ADWINBaggingClassifier", 27 | "ADWINBoostingClassifier", 28 | "BaggingClassifier", 29 | "BaggingRegressor", 30 | "BOLEClassifier", 31 | "EWARegressor", 32 | "LeveragingBaggingClassifier", 33 | "SRPClassifier", 34 | "SRPRegressor", 35 | "StackingClassifier", 36 | "VotingClassifier", 37 | ] 38 | -------------------------------------------------------------------------------- /river/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | """Model evaluation. 2 | 3 | This module provides utilities to evaluate an online model. The goal is to reproduce a real-world 4 | scenario with high fidelity. The core function of this module is `progressive_val_score`, which 5 | allows to evaluate a model via progressive validation. 6 | 7 | This module also exposes "tracks". A track is a predefined combination of a dataset and one or more 8 | metrics. This allows a principled manner to compare models with each other. For instance, 9 | the `RegressionTrack` contains several datasets and metrics to evaluate regression models. There is 10 | also a bare `Track` class to implement a custom track. The `benchmarks` directory at the root of 11 | the River repository uses these tracks. 12 | 13 | """ 14 | 15 | from __future__ import annotations 16 | 17 | from .progressive_validation import iter_progressive_val_score, progressive_val_score 18 | from .tracks import BinaryClassificationTrack, MultiClassClassificationTrack, RegressionTrack, Track 19 | 20 | __all__ = [ 21 | "iter_progressive_val_score", 22 | "progressive_val_score", 23 | "BinaryClassificationTrack", 24 | "MultiClassClassificationTrack", 25 | "RegressionTrack", 26 | "Track", 27 | ] 28 | -------------------------------------------------------------------------------- /river/facto/__init__.py: -------------------------------------------------------------------------------- 1 | """Factorization machines.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .ffm import FFMClassifier, FFMRegressor 6 | from .fm import FMClassifier, FMRegressor 7 | from .fwfm import FwFMClassifier, FwFMRegressor 8 | from .hofm import HOFMClassifier, HOFMRegressor 9 | 10 | __all__ = [ 11 | "FFMClassifier", 12 | "FFMRegressor", 13 | "FMClassifier", 14 | "FMRegressor", 15 | "FwFMClassifier", 16 | "FwFMRegressor", 17 | "HOFMClassifier", 18 | "HOFMRegressor", 19 | ] 20 | -------------------------------------------------------------------------------- /river/feature_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | """Feature extraction. 2 | 3 | This module can be used to extract information from raw features. This includes encoding 4 | categorical data as well as looking at interactions between existing features. This differs from 5 | the `preprocessing` module, in that the latter's purpose is rather to clean the data so that it may 6 | be processed by a particular machine learning algorithm. 
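As a small illustration of what extracting information from raw features means here, `BagOfWords` turns a piece of text into a dictionary of token counts that any downstream model can consume. The sentence below is made up:

>>> from river import feature_extraction

>>> bow = feature_extraction.BagOfWords(lowercase=True)
>>> counts = bow.transform_one("simple sentences about simple streams")  # e.g. {'simple': 2, 'sentences': 1, ...}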
7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from .agg import Agg, TargetAgg 13 | from .kernel_approx import RBFSampler 14 | from .poly import PolynomialExtender 15 | from .vectorize import TFIDF, BagOfWords 16 | 17 | __all__ = [ 18 | "Agg", 19 | "BagOfWords", 20 | "PolynomialExtender", 21 | "RBFSampler", 22 | "TargetAgg", 23 | "TFIDF", 24 | ] 25 | -------------------------------------------------------------------------------- /river/feature_extraction/test_vectorize.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from river import feature_extraction 6 | 7 | 8 | @pytest.mark.parametrize( 9 | "params, text, expected_ngrams", 10 | [ 11 | pytest.param( 12 | *case, 13 | id=f"#{i}", 14 | ) 15 | for i, case in enumerate( 16 | [ 17 | ({}, "one two three", ["one", "two", "three"]), 18 | ( 19 | {}, 20 | """one two\tthree four\t\tfive 21 | six 22 | 23 | seven""", 24 | ["one", "two", "three", "four", "five", "six", "seven"], 25 | ), 26 | ( 27 | {"ngram_range": (1, 2)}, 28 | "one two three", 29 | ["one", "two", "three", ("one", "two"), ("two", "three")], 30 | ), 31 | ({"ngram_range": (2, 2)}, "one two three", [("one", "two"), ("two", "three")]), 32 | ( 33 | {"ngram_range": (2, 3)}, 34 | "one two three", 35 | [("one", "two"), ("two", "three"), ("one", "two", "three")], 36 | ), 37 | ({"stop_words": {"two", "three"}}, "one two three four", ["one", "four"]), 38 | ( 39 | {"stop_words": {"two", "three"}, "ngram_range": (1, 2)}, 40 | "one two three four", 41 | ["one", "four", ("one", "four")], 42 | ), 43 | ] 44 | ) 45 | ], 46 | ) 47 | def test_ngrams(params, text, expected_ngrams): 48 | bow = feature_extraction.BagOfWords(**params) 49 | ngrams = list(bow.process_text(text)) 50 | assert expected_ngrams == ngrams 51 | -------------------------------------------------------------------------------- /river/feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | """Feature selection.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .k_best import SelectKBest 6 | from .random import PoissonInclusion 7 | from .variance import VarianceThreshold 8 | 9 | __all__ = ["PoissonInclusion", "SelectKBest", "VarianceThreshold"] 10 | -------------------------------------------------------------------------------- /river/forest/__init__.py: -------------------------------------------------------------------------------- 1 | """This module implements forest-based classifiers and regressors.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .adaptive_random_forest import ARFClassifier, ARFRegressor 6 | from .aggregated_mondrian_forest import AMFClassifier, AMFRegressor 7 | from .online_extra_trees import OXTRegressor 8 | 9 | __all__ = [ 10 | "ARFClassifier", 11 | "ARFRegressor", 12 | "AMFClassifier", 13 | "AMFRegressor", 14 | "OXTRegressor", 15 | ] 16 | -------------------------------------------------------------------------------- /river/forest/test_amf.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | 4 | def test_issue_1272(): 5 | """ 6 | 7 | https://github.com/online-ml/river/issues/1272 8 | 9 | >>> import river 10 | >>> from river import forest, metrics 11 | 12 | >>> model = forest.ARFClassifier(metric=metrics.CrossEntropy()) 13 | >>> model.learn_one({"x": 1}, True) 14 | 15 | >>> model = forest.ARFClassifier() 16 | >>> model.learn_one({"x": 
1}, True) 17 | 18 | """ 19 | -------------------------------------------------------------------------------- /river/imblearn/__init__.py: -------------------------------------------------------------------------------- 1 | """Sampling methods.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .chebyshev import ChebyshevOverSampler, ChebyshevUnderSampler 6 | from .hard_sampling import HardSamplingClassifier, HardSamplingRegressor 7 | from .random import RandomOverSampler, RandomSampler, RandomUnderSampler 8 | 9 | __all__ = [ 10 | "ChebyshevOverSampler", 11 | "ChebyshevUnderSampler", 12 | "HardSamplingClassifier", 13 | "HardSamplingRegressor", 14 | "RandomOverSampler", 15 | "RandomUnderSampler", 16 | "RandomSampler", 17 | ] 18 | -------------------------------------------------------------------------------- /river/linear_model/__init__.py: -------------------------------------------------------------------------------- 1 | """Linear models.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import base 6 | from .alma import ALMAClassifier 7 | from .bayesian_lin_reg import BayesianLinearRegression 8 | from .lin_reg import LinearRegression 9 | from .log_reg import LogisticRegression 10 | from .pa import PAClassifier, PARegressor 11 | from .perceptron import Perceptron 12 | from .softmax import SoftmaxRegression 13 | 14 | __all__ = [ 15 | "base", 16 | "ALMAClassifier", 17 | "BayesianLinearRegression", 18 | "LinearRegression", 19 | "LogisticRegression", 20 | "PAClassifier", 21 | "PARegressor", 22 | "Perceptron", 23 | "SoftmaxRegression", 24 | ] 25 | -------------------------------------------------------------------------------- /river/metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics 4 | 5 | __all__ = ["Accuracy"] 6 | 7 | 8 | class Accuracy(metrics.base.MultiClassMetric): 9 | """Accuracy score, which is the percentage of exact matches. 10 | 11 | Parameters 12 | ---------- 13 | cm 14 | This parameter allows sharing the same confusion 15 | matrix between multiple metrics. Sharing a confusion matrix reduces the amount of storage 16 | and computation time. 17 | 18 | Examples 19 | -------- 20 | 21 | >>> from river import metrics 22 | 23 | >>> y_true = [True, False, True, True, True] 24 | >>> y_pred = [True, True, False, True, True] 25 | 26 | >>> metric = metrics.Accuracy() 27 | >>> for yt, yp in zip(y_true, y_pred): 28 | ... metric.update(yt, yp) 29 | 30 | >>> metric 31 | Accuracy: 60.00% 32 | 33 | """ 34 | 35 | def get(self): 36 | try: 37 | return self.cm.total_true_positives / self.cm.total_weight 38 | except ZeroDivisionError: 39 | return 0.0 40 | -------------------------------------------------------------------------------- /river/metrics/balanced_accuracy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics 4 | 5 | __all__ = ["BalancedAccuracy"] 6 | 7 | 8 | class BalancedAccuracy(metrics.base.MultiClassMetric): 9 | """Balanced accuracy. 10 | 11 | Balanced accuracy is the average of recall obtained on each class. It is used to 12 | deal with imbalanced datasets in binary and multi-class classification problems. 13 | 14 | Parameters 15 | ---------- 16 | cm 17 | This parameter allows sharing the same confusion 18 | matrix between multiple metrics. Sharing a confusion matrix reduces the amount of storage 19 | and computation time. 
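The sharing described above can be pictured with a small sketch: a single `ConfusionMatrix` is updated once per observation, and every metric built on top of it simply reads the shared counts. The toy labels are made up:

>>> from river import metrics

>>> cm = metrics.ConfusionMatrix()
>>> acc = metrics.Accuracy(cm=cm)
>>> bal_acc = metrics.BalancedAccuracy(cm=cm)
>>> for yt, yp in zip([True, False, True], [True, True, True]):
...     cm.update(yt, yp)
>>> results = acc.get(), bal_acc.get()  # both values derive from the same underlying counts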
20 | 21 | Examples 22 | -------- 23 | 24 | >>> from river import metrics 25 | >>> y_true = [True, False, True, True, False, True] 26 | >>> y_pred = [True, False, True, True, True, False] 27 | 28 | >>> metric = metrics.BalancedAccuracy() 29 | >>> for yt, yp in zip(y_true, y_pred): 30 | ... metric.update(yt, yp) 31 | 32 | >>> metric 33 | BalancedAccuracy: 62.50% 34 | 35 | >>> y_true = [0, 1, 0, 0, 1, 0] 36 | >>> y_pred = [0, 1, 0, 0, 0, 1] 37 | >>> metric = metrics.BalancedAccuracy() 38 | >>> for yt, yp in zip(y_true, y_pred): 39 | ... metric.update(yt, yp) 40 | 41 | >>> metric 42 | BalancedAccuracy: 62.50% 43 | 44 | """ 45 | 46 | def get(self): 47 | total = 0 48 | for c in self.cm.classes: 49 | try: 50 | total += self.cm[c][c] / self.cm.sum_row[c] 51 | except ZeroDivisionError: 52 | continue 53 | try: 54 | score = total / len(self.cm.classes) 55 | 56 | return score 57 | 58 | except ZeroDivisionError: 59 | return 0.0 60 | -------------------------------------------------------------------------------- /river/metrics/cross_entropy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | from river import metrics, utils 6 | 7 | __all__ = ["CrossEntropy"] 8 | 9 | 10 | class CrossEntropy(metrics.base.MeanMetric, metrics.base.MultiClassMetric): 11 | """Multiclass generalization of the logarithmic loss. 12 | 13 | Examples 14 | -------- 15 | 16 | >>> from river import metrics 17 | 18 | >>> y_true = [0, 1, 2, 2] 19 | >>> y_pred = [ 20 | ... {0: 0.29450637, 1: 0.34216758, 2: 0.36332605}, 21 | ... {0: 0.21290077, 1: 0.32728332, 2: 0.45981591}, 22 | ... {0: 0.42860913, 1: 0.33380113, 2: 0.23758974}, 23 | ... {0: 0.44941979, 1: 0.32962558, 2: 0.22095463} 24 | ... ] 25 | 26 | >>> metric = metrics.CrossEntropy() 27 | 28 | >>> for yt, yp in zip(y_true, y_pred): 29 | ... metric.update(yt, yp) 30 | ... 
print(metric.get()) 31 | 1.222454 32 | 1.169691 33 | 1.258864 34 | 1.321597 35 | 36 | >>> metric 37 | CrossEntropy: 1.321598 38 | 39 | """ 40 | 41 | _fmt = "" 42 | 43 | @property 44 | def bigger_is_better(self): 45 | return False 46 | 47 | @property 48 | def requires_labels(self): 49 | return False 50 | 51 | def _eval(self, y_true, y_pred): 52 | total = 0 53 | 54 | for label, proba in y_pred.items(): 55 | if y_true == label: 56 | total += math.log(utils.math.clamp(x=proba, minimum=1e-15, maximum=1 - 1e-15)) 57 | 58 | return -total 59 | -------------------------------------------------------------------------------- /river/metrics/efficient_rollingrocauc/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .efficient_rollingrocauc import EfficientRollingROCAUC 4 | 5 | __all__ = ["EfficientRollingROCAUC"] 6 | -------------------------------------------------------------------------------- /river/metrics/efficient_rollingrocauc/cpp/RollingROCAUC.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ROLLINGROCAUC_HPP 2 | #define ROLLINGROCAUC_HPP 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace rollingrocauc { 10 | 11 | class RollingROCAUC { 12 | public: 13 | RollingROCAUC(); 14 | RollingROCAUC(const int positiveLabel, const long unsigned windowSize); 15 | 16 | virtual ~RollingROCAUC() = default; 17 | 18 | // Calls insert() and removeLast if needed 19 | virtual void update(const int label, const double score); 20 | 21 | // Erase the most recent instance with content equal to params 22 | virtual void revert(const int label, const double score); 23 | 24 | // Calculates the ROCAUC and return it 25 | virtual double get() const; 26 | 27 | // Returns y_true as a vector 28 | virtual std::vector getTrueLabels() const; 29 | 30 | // Returns y_score as a vector 31 | virtual std::vector getScores() const; 32 | 33 | private: 34 | // Insert instance based on params 35 | virtual void insert(const int label, const double score); 36 | 37 | // Remove oldest instance 38 | virtual void removeLast(); 39 | 40 | int positiveLabel; 41 | 42 | std::size_t windowSize; 43 | std::size_t positives; 44 | 45 | // window maintains a queue of the instances to store the temporal 46 | // aspect of the stream. 
Using deque to allow revert() 47 | std::deque> window; 48 | 49 | // orderedWindow maintains a multiset (implemented as a tree) to store 50 | // the instances sorted 51 | std::multiset> orderedWindow; 52 | }; 53 | 54 | } // namespace rollingrocauc 55 | 56 | #endif 57 | -------------------------------------------------------------------------------- /river/metrics/efficient_rollingrocauc/efficient_rollingrocauc.pxd: -------------------------------------------------------------------------------- 1 | from libcpp.vector cimport vector 2 | 3 | cdef extern from "cpp/RollingROCAUC.cpp": 4 | pass 5 | 6 | cdef extern from "cpp/RollingROCAUC.hpp" namespace "rollingrocauc": 7 | cdef cppclass RollingROCAUC: 8 | RollingROCAUC(int positiveLabel, int windowSize) except + 9 | void update(int label, double score) 10 | void revert(int label, double score) 11 | double get() 12 | vector[int] getTrueLabels() 13 | vector[double] getScores() 14 | -------------------------------------------------------------------------------- /river/metrics/efficient_rollingrocauc/efficient_rollingrocauc.pyi: -------------------------------------------------------------------------------- 1 | from collections.abc import Sequence 2 | from typing import Any 3 | 4 | class EfficientRollingROCAUC: 5 | def __cinit__(self, positiveLabel: int, windowSize: int) -> None: ... 6 | def __dealloc__(self) -> None: ... 7 | def update(self, label: bool, score: bool | float | dict[bool, float]) -> None: ... 8 | def revert(self, label: bool, score: bool | float | dict[bool, float]) -> None: ... 9 | def get(self) -> float: ... 10 | def __getnewargs_ex__(self) -> tuple[tuple[int, int], dict[str, Any]]: ... 11 | def __getstate__(self) -> tuple[Sequence[int], Sequence[float]]: ... 12 | def __setstate__(self, state: tuple[Sequence[int], Sequence[float]]) -> None: ... 13 | -------------------------------------------------------------------------------- /river/metrics/expected_mutual_info.pyi: -------------------------------------------------------------------------------- 1 | from river import metrics 2 | 3 | def expected_mutual_info(confusion_matrix: metrics.ConfusionMatrix) -> float: ... 4 | -------------------------------------------------------------------------------- /river/metrics/log_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | from river import metrics 6 | 7 | __all__ = ["LogLoss"] 8 | 9 | 10 | class LogLoss(metrics.base.MeanMetric, metrics.base.BinaryMetric): 11 | """Binary logarithmic loss. 12 | 13 | Examples 14 | -------- 15 | 16 | >>> from river import metrics 17 | 18 | >>> y_true = [True, False, False, True] 19 | >>> y_pred = [0.9, 0.1, 0.2, 0.65] 20 | 21 | >>> metric = metrics.LogLoss() 22 | >>> for yt, yp in zip(y_true, y_pred): 23 | ... metric.update(yt, yp) 24 | ... 
print(metric.get()) 25 | 0.105360 26 | 0.105360 27 | 0.144621 28 | 0.216161 29 | 30 | >>> metric 31 | LogLoss: 0.216162 32 | 33 | """ 34 | 35 | _fmt = "" 36 | 37 | @property 38 | def bigger_is_better(self): 39 | return False 40 | 41 | @property 42 | def requires_labels(self): 43 | return False 44 | 45 | def _eval(self, y_true, y_pred): 46 | p_true = y_pred.get(True, 0.0) if isinstance(y_pred, dict) else y_pred 47 | p_true = self._clamp_proba(p_true) 48 | if y_true: 49 | return -math.log(p_true) 50 | return -math.log(1 - p_true) 51 | -------------------------------------------------------------------------------- /river/metrics/mae.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics 4 | 5 | __all__ = ["MAE"] 6 | 7 | 8 | class MAE(metrics.base.MeanMetric, metrics.base.RegressionMetric): 9 | """Mean absolute error. 10 | 11 | Examples 12 | -------- 13 | 14 | >>> from river import metrics 15 | 16 | >>> y_true = [3, -0.5, 2, 7] 17 | >>> y_pred = [2.5, 0.0, 2, 8] 18 | 19 | >>> metric = metrics.MAE() 20 | 21 | >>> for yt, yp in zip(y_true, y_pred): 22 | ... metric.update(yt, yp) 23 | ... print(metric.get()) 24 | 0.5 25 | 0.5 26 | 0.333 27 | 0.5 28 | 29 | >>> metric 30 | MAE: 0.5 31 | 32 | """ 33 | 34 | def _eval(self, y_true, y_pred): 35 | return abs(y_true - y_pred) 36 | -------------------------------------------------------------------------------- /river/metrics/mape.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics 4 | 5 | __all__ = ["MAPE"] 6 | 7 | 8 | class MAPE(metrics.base.MeanMetric, metrics.base.RegressionMetric): 9 | """Mean absolute percentage error. 10 | 11 | Examples 12 | -------- 13 | 14 | >>> from river import metrics 15 | 16 | >>> y_true = [3, -0.5, 2, 7] 17 | >>> y_pred = [2.5, 0.0, 2, 8] 18 | 19 | >>> metric = metrics.MAPE() 20 | >>> for yt, yp in zip(y_true, y_pred): 21 | ... metric.update(yt, yp) 22 | 23 | >>> metric 24 | MAPE: 32.738095 25 | 26 | """ 27 | 28 | def _eval(self, y_true, y_pred): 29 | if y_true == 0: 30 | return 0.0 31 | return abs(y_true - y_pred) / abs(y_true) 32 | 33 | def get(self): 34 | return 100 * super().get() 35 | -------------------------------------------------------------------------------- /river/metrics/mcc.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | from river import metrics 6 | 7 | __all__ = ["MCC"] 8 | 9 | 10 | class MCC(metrics.base.BinaryMetric): 11 | """Matthews correlation coefficient. 12 | 13 | Parameters 14 | ---------- 15 | cm 16 | This parameter allows sharing the same confusion matrix between multiple metrics. Sharing a 17 | confusion matrix reduces the amount of storage and computation time. 18 | pos_val 19 | Value to treat as "positive". 20 | 21 | Examples 22 | -------- 23 | 24 | >>> from river import metrics 25 | 26 | >>> y_true = [True, True, True, False] 27 | >>> y_pred = [True, False, True, True] 28 | 29 | >>> mcc = metrics.MCC() 30 | 31 | >>> for yt, yp in zip(y_true, y_pred): 32 | ... 
mcc.update(yt, yp) 33 | 34 | >>> mcc 35 | MCC: -0.333333 36 | 37 | References 38 | ---------- 39 | [^1]: [Wikipedia article](https://www.wikiwand.com/en/Matthews_correlation_coefficient) 40 | 41 | """ 42 | 43 | _fmt = "" 44 | 45 | def get(self): 46 | tp = self.cm.true_positives(self.pos_val) 47 | tn = self.cm.true_negatives(self.pos_val) 48 | fp = self.cm.false_positives(self.pos_val) 49 | fn = self.cm.false_negatives(self.pos_val) 50 | 51 | n = (tp + tn + fp + fn) or 1 52 | s = (tp + fn) / n 53 | p = (tp + fp) / n 54 | 55 | try: 56 | return (tp / n - s * p) / math.sqrt(p * s * (1 - s) * (1 - p)) 57 | except ZeroDivisionError: 58 | return 0.0 59 | -------------------------------------------------------------------------------- /river/metrics/multioutput/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for multi-output learning.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import base 6 | from .confusion import MultiLabelConfusionMatrix 7 | from .exact_match import ExactMatch 8 | from .macro import MacroAverage 9 | from .micro import MicroAverage 10 | from .per_output import PerOutput 11 | from .sample_average import SampleAverage 12 | 13 | __all__ = [ 14 | "base", 15 | "MacroAverage", 16 | "MultiLabelConfusionMatrix", 17 | "ExactMatch", 18 | "MicroAverage", 19 | "PerOutput", 20 | "SampleAverage", 21 | ] 22 | -------------------------------------------------------------------------------- /river/metrics/multioutput/exact_match.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics 4 | from river.metrics.multioutput.base import MultiOutputClassificationMetric 5 | 6 | __all__ = ["ExactMatch"] 7 | 8 | 9 | class ExactMatch(metrics.base.MeanMetric, MultiOutputClassificationMetric): 10 | """Exact match score. 11 | 12 | This is the most strict multi-label metric, defined as the number of 13 | samples that have all their labels correctly classified, divided by the 14 | total number of samples. 15 | 16 | Parameters 17 | ---------- 18 | cm 19 | This parameter allows sharing the same confusion matrix between multiple metrics. Sharing a 20 | confusion matrix reduces the amount of storage and computation time. 21 | 22 | Examples 23 | -------- 24 | 25 | >>> from river import metrics 26 | 27 | >>> y_true = [ 28 | ... {0: False, 1: True, 2: True}, 29 | ... {0: True, 1: True, 2: False}, 30 | ... {0: True, 1: True, 2: False}, 31 | ... ] 32 | 33 | >>> y_pred = [ 34 | ... {0: True, 1: True, 2: True}, 35 | ... {0: True, 1: False, 2: False}, 36 | ... {0: True, 1: True, 2: False}, 37 | ... ] 38 | 39 | >>> metric = metrics.multioutput.ExactMatch() 40 | >>> for yt, yp in zip(y_true, y_pred): 41 | ... 
metric.update(yt, yp) 42 | 43 | >>> metric 44 | ExactMatch: 33.33% 45 | 46 | """ 47 | 48 | def _eval(self, y_true, y_pred): 49 | return y_true == y_pred 50 | -------------------------------------------------------------------------------- /river/metrics/multioutput/macro.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import statistics 4 | from collections import defaultdict 5 | from copy import deepcopy 6 | from functools import partial 7 | 8 | from river import metrics, utils 9 | from river.metrics.multioutput.base import MultiOutputMetric 10 | 11 | __all__ = ["MacroAverage"] 12 | 13 | 14 | class MacroAverage(MultiOutputMetric, metrics.base.WrapperMetric): 15 | """Macro-average wrapper. 16 | 17 | A copy of the provided metric is made for each output. The arithmetic average of all the 18 | metrics is returned. 19 | 20 | Parameters 21 | ---------- 22 | metric 23 | A classification or a regression metric. 24 | 25 | """ 26 | 27 | def __init__(self, metric): 28 | self._metric = metric 29 | self.metrics = defaultdict(partial(deepcopy, self._metric)) 30 | 31 | @property 32 | def metric(self): 33 | return self._metric 34 | 35 | def works_with(self, model) -> bool: 36 | if isinstance(self.metric, metrics.base.ClassificationMetric): 37 | return utils.inspect.ismoclassifier(model) 38 | return utils.inspect.ismoregressor(model) 39 | 40 | def update(self, y_true, y_pred, w=1.0): 41 | for i in y_true: 42 | self.metrics[i].update(y_true[i], y_pred[i], w) 43 | 44 | def revert(self, y_true, y_pred, w=1.0): 45 | for i in y_true: 46 | self.metrics[i].revert(y_true[i], y_pred[i], w) 47 | 48 | def get(self): 49 | return statistics.mean(metric.get() for metric in self.metrics.values()) 50 | -------------------------------------------------------------------------------- /river/metrics/multioutput/micro.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics, utils 4 | from river.metrics.multioutput.base import MultiOutputMetric 5 | 6 | __all__ = ["MicroAverage"] 7 | 8 | 9 | class MicroAverage(MultiOutputMetric, metrics.base.WrapperMetric): 10 | """Micro-average wrapper. 11 | 12 | The provided metric is updated with the value of each output. 13 | 14 | Parameters 15 | ---------- 16 | metric 17 | A classification or a regression metric. 
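A minimal sketch of how such a wrapper is fed multi-output predictions; the label dictionaries below are made up:

>>> from river import metrics

>>> micro = metrics.multioutput.MicroAverage(metrics.Accuracy())
>>> micro.update({0: True, 1: False}, {0: True, 1: True})
>>> value = micro.get()  # the wrapped Accuracy has received one update per output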
18 | 19 | """ 20 | 21 | def __init__(self, metric): 22 | self._metric = metric 23 | 24 | @property 25 | def metric(self): 26 | return self._metric 27 | 28 | def works_with(self, model) -> bool: 29 | if isinstance(self.metric, metrics.base.ClassificationMetric): 30 | return utils.inspect.ismoclassifier(model) 31 | return utils.inspect.ismoregressor(model) 32 | 33 | def update(self, y_true, y_pred, w=1.0): 34 | for i in y_true: 35 | self.metric.update(y_true[i], y_pred[i], w) 36 | 37 | def revert(self, y_true, y_pred, w=1.0): 38 | for i in y_true: 39 | self.metric.revert(y_true[i], y_pred[i], w) 40 | 41 | def get(self): 42 | return self.metric.get() 43 | -------------------------------------------------------------------------------- /river/metrics/multioutput/per_output.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import defaultdict 4 | from copy import deepcopy 5 | from functools import partial 6 | 7 | from river import metrics, utils 8 | from river.metrics.multioutput.base import MultiOutputMetric 9 | 10 | __all__ = ["PerOutput"] 11 | 12 | 13 | class PerOutput(MultiOutputMetric, metrics.base.WrapperMetric): 14 | """Per-output wrapper. 15 | 16 | A copy of the metric is maintained for each output. 17 | 18 | Parameters 19 | ---------- 20 | metric 21 | A classification or a regression metric. 22 | 23 | """ 24 | 25 | def __init__(self, metric): 26 | self._metric = metric 27 | self.metrics = defaultdict(partial(deepcopy, self._metric)) 28 | 29 | @property 30 | def metric(self): 31 | return self._metric 32 | 33 | def works_with(self, model) -> bool: 34 | if isinstance(self.metric, metrics.base.ClassificationMetric): 35 | return utils.inspect.ismoclassifier(model) 36 | return utils.inspect.ismoregressor(model) 37 | 38 | def update(self, y_true, y_pred, w=1.0): 39 | for i in y_true: 40 | self.metrics[i].update(y_true[i], y_pred[i], w) 41 | 42 | def revert(self, y_true, y_pred, w=1.0): 43 | for i in y_true: 44 | self.metrics[i].revert(y_true[i], y_pred[i], w) 45 | 46 | def get(self): 47 | return dict(self.metrics) 48 | 49 | def __repr__(self): 50 | return "\n".join(f"{i} - {metric}" for i, metric in self.metrics.items()) 51 | -------------------------------------------------------------------------------- /river/metrics/smape.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import metrics 4 | 5 | __all__ = ["SMAPE"] 6 | 7 | 8 | class SMAPE(metrics.base.MeanMetric, metrics.base.RegressionMetric): 9 | """Symmetric mean absolute percentage error. 10 | 11 | Examples 12 | -------- 13 | 14 | >>> from river import metrics 15 | 16 | >>> y_true = [0, 0.07533, 0.07533, 0.07533, 0.07533, 0.07533, 0.07533, 0.0672, 0.0672] 17 | >>> y_pred = [0, 0.102, 0.107, 0.047, 0.1, 0.032, 0.047, 0.108, 0.089] 18 | 19 | >>> metric = metrics.SMAPE() 20 | >>> for yt, yp in zip(y_true, y_pred): 21 | ... 
metric.update(yt, yp) 22 | 23 | >>> metric 24 | SMAPE: 37.869392 25 | 26 | """ 27 | 28 | def _eval(self, y_true, y_pred): 29 | den = abs(y_true) + abs(y_pred) 30 | if den == 0: 31 | return 0.0 32 | return 2.0 * abs(y_true - y_pred) / den 33 | 34 | def get(self): 35 | return 100 * super().get() 36 | -------------------------------------------------------------------------------- /river/metrics/test_confusion.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, evaluate, linear_model, metrics, optim, preprocessing 4 | 5 | 6 | def test_issue_1443(): 7 | dataset = datasets.Phishing() 8 | 9 | model = preprocessing.StandardScaler() | linear_model.LogisticRegression( 10 | optimizer=optim.SGD(0.1) 11 | ) 12 | 13 | metric = metrics.ConfusionMatrix() 14 | 15 | for _ in evaluate.iter_progressive_val_score(dataset, model, metric): 16 | pass 17 | 18 | 19 | def test_confusion_and_other_metrics(): 20 | """ 21 | 22 | >>> dataset = datasets.Phishing() 23 | 24 | >>> model = preprocessing.StandardScaler() | linear_model.LogisticRegression( 25 | ... optimizer=optim.SGD(0.1) 26 | ... ) 27 | 28 | >>> metric = metrics.ConfusionMatrix() + metrics.F1() + metrics.Accuracy() 29 | 30 | >>> evaluate.progressive_val_score(dataset, model, metric) 31 | False True 32 | False 613 89 33 | True 49 499 34 | F1: 87.85% 35 | Accuracy: 88.96% 36 | 37 | """ 38 | -------------------------------------------------------------------------------- /river/metrics/test_cross_entropy.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | from sklearn import metrics as sk_metrics 6 | 7 | from river import metrics 8 | 9 | 10 | def test_cross_entropy(): 11 | metric = metrics.CrossEntropy() 12 | 13 | y_true = [0, 1, 2, 2] 14 | y_pred = [ 15 | [0.29450637, 0.34216758, 0.36332605], 16 | [0.21290077, 0.32728332, 0.45981591], 17 | [0.42860913, 0.33380113, 0.23758974], 18 | [0.44941979, 0.32962558, 0.22095463], 19 | ] 20 | 21 | for i, (yt, yp) in enumerate(zip(y_true, y_pred)): 22 | yp = dict(enumerate(yp)) 23 | metric.update(yt, yp) 24 | 25 | if i >= 1: 26 | assert math.isclose( 27 | metric.get(), 28 | sk_metrics.log_loss(y_true[: i + 1], y_pred[: i + 1], labels=[0, 1, 2]), 29 | ) 30 | 31 | metric.revert(y_true[-1], dict(enumerate(y_pred[-1]))) 32 | assert math.isclose(metric.get(), sk_metrics.log_loss(y_true[:-1], y_pred[:-1])) 33 | -------------------------------------------------------------------------------- /river/metrics/test_log_loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | from sklearn import metrics as sk_metrics 6 | 7 | from river import metrics 8 | 9 | 10 | def test_log_loss(): 11 | metric = metrics.LogLoss() 12 | 13 | y_true = [True, False, False, True] 14 | y_pred = [0.9, 0.1, 0.2, 0.65] 15 | 16 | for i, (yt, yp) in enumerate(zip(y_true, y_pred)): 17 | metric.update(yt, yp) 18 | 19 | if i >= 1: 20 | assert math.isclose(metric.get(), sk_metrics.log_loss(y_true[: i + 1], y_pred[: i + 1])) 21 | 22 | metric.revert(y_true[-1], y_pred[-1]) 23 | assert math.isclose(metric.get(), sk_metrics.log_loss(y_true[:-1], y_pred[:-1])) 24 | -------------------------------------------------------------------------------- /river/misc/__init__.py: -------------------------------------------------------------------------------- 1 | 
"""Miscellaneous. 2 | 3 | This module essentially regroups some implementations that have nowhere else to go. 4 | 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from .sdft import SDFT 10 | from .skyline import Skyline 11 | 12 | __all__ = ["SDFT", "Skyline"] 13 | -------------------------------------------------------------------------------- /river/model_selection/__init__.py: -------------------------------------------------------------------------------- 1 | """Model selection. 2 | 3 | This module regroups a variety of methods that may be used for performing model selection. An 4 | model selector is provided with a list of models. These are called "experts" in the expert learning 5 | literature. The model selector's goal is to perform at least as well as the best model. Indeed, 6 | initially, the best model is not known. The performance of each model becomes more apparent as time 7 | goes by. Different strategies are possible, each one offering a different tradeoff in terms of 8 | accuracy and computational performance. 9 | 10 | Model selection can be used for tuning the hyperparameters of a model. This may be done by creating 11 | a copy of the model for each set of hyperparameters, and treating each copy as a separate model. 12 | The `utils.expand_param_grid` function can be used for this purpose. 13 | 14 | """ 15 | 16 | from __future__ import annotations 17 | 18 | from . import base 19 | from .bandit import BanditClassifier, BanditRegressor 20 | from .greedy import GreedyRegressor 21 | from .sh import SuccessiveHalvingClassifier, SuccessiveHalvingRegressor 22 | 23 | __all__ = [ 24 | "base", 25 | "BanditClassifier", 26 | "BanditRegressor", 27 | "GreedyRegressor", 28 | "SuccessiveHalvingClassifier", 29 | "SuccessiveHalvingRegressor", 30 | ] 31 | -------------------------------------------------------------------------------- /river/multiclass/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi-class classification.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .occ import OutputCodeClassifier 6 | from .ovo import OneVsOneClassifier 7 | from .ovr import OneVsRestClassifier 8 | 9 | __all__ = ["OutputCodeClassifier", "OneVsOneClassifier", "OneVsRestClassifier"] 10 | -------------------------------------------------------------------------------- /river/multiclass/test_ovr.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | from river import datasets, linear_model, metrics, multiclass, preprocessing, stream 6 | 7 | 8 | def test_online_batch_consistent(): 9 | # Batch 10 | 11 | batch = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier( 12 | linear_model.LogisticRegression() 13 | ) 14 | 15 | dataset = datasets.ImageSegments() 16 | 17 | batch_metric = metrics.MacroF1() 18 | 19 | for i, x in enumerate(pd.read_csv(dataset.path, chunksize=1)): 20 | y = x.pop("category") 21 | y_pred = batch.predict_many(x) 22 | batch.learn_many(x, y) 23 | 24 | for yt, yp in zip(y, y_pred): 25 | if yp is not None: 26 | batch_metric.update(yt, yp) 27 | 28 | if i == 30: 29 | break 30 | 31 | # Online 32 | 33 | online = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier( 34 | linear_model.LogisticRegression() 35 | ) 36 | 37 | online_metric = metrics.MacroF1() 38 | 39 | X = pd.read_csv(dataset.path) 40 | Y = X.pop("category") 41 | 42 | for i, (x, y) in enumerate(stream.iter_pandas(X, Y)): 43 | y_pred = 
online.predict_one(x) 44 | online.learn_one(x, y) 45 | 46 | if y_pred is not None: 47 | online_metric.update(y, y_pred) 48 | 49 | if i == 30: 50 | break 51 | 52 | assert online_metric.get() == batch_metric.get() 53 | -------------------------------------------------------------------------------- /river/multioutput/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi-output models.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .chain import ( 6 | ClassifierChain, 7 | MonteCarloClassifierChain, 8 | ProbabilisticClassifierChain, 9 | RegressorChain, 10 | ) 11 | from .encoder import MultiClassEncoder 12 | 13 | __all__ = [ 14 | "ClassifierChain", 15 | "MonteCarloClassifierChain", 16 | "MultiClassEncoder", 17 | "ProbabilisticClassifierChain", 18 | "RegressorChain", 19 | ] 20 | -------------------------------------------------------------------------------- /river/naive_bayes/__init__.py: -------------------------------------------------------------------------------- 1 | """Naive Bayes algorithms.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .bernoulli import BernoulliNB 6 | from .complement import ComplementNB 7 | from .gaussian import GaussianNB 8 | from .multinomial import MultinomialNB 9 | 10 | __all__ = ["BernoulliNB", "ComplementNB", "GaussianNB", "MultinomialNB"] 11 | -------------------------------------------------------------------------------- /river/neighbors/__init__.py: -------------------------------------------------------------------------------- 1 | """Neighbors-based learning. 2 | 3 | Also known as *lazy* methods. In these methods, generalisation of the training data is delayed 4 | until a query is received. 5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from .ann import SWINN 11 | from .knn_classifier import KNNClassifier 12 | from .knn_regressor import KNNRegressor 13 | from .lazy import LazySearch 14 | 15 | __all__ = [ 16 | "LazySearch", 17 | "KNNClassifier", 18 | "KNNRegressor", 19 | "SWINN", 20 | ] 21 | -------------------------------------------------------------------------------- /river/neighbors/ann/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .swinn import SWINN 4 | 5 | __all__ = ["SWINN"] 6 | -------------------------------------------------------------------------------- /river/neighbors/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | import typing 5 | 6 | from river import base 7 | 8 | 9 | class DistanceFunc(typing.Protocol): 10 | def __call__(self, a: typing.Any, b: typing.Any, **kwargs) -> float: ... 11 | 12 | 13 | class FunctionWrapper: 14 | """Wrapper used to make distance function work with KNNClassifier and 15 | KNNRegressor. 16 | 17 | The k-NN-based classifier and regressor store tuples with `(x, y)`, but only 18 | `x` is used for distance calculations. This wrapper makes sure `x` is accessed 19 | when calculating the distances. 20 | 21 | Parameters 22 | ---------- 23 | distance_function 24 | The custom distance function to be wrapped. 
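A minimal sketch of the wrapping in action; the `manhattan` helper is a made-up distance function, not something shipped with the module:

>>> def manhattan(a, b):
...     keys = set(a) | set(b)
...     return sum(abs(a.get(k, 0) - b.get(k, 0)) for k in keys)

>>> wrapped = FunctionWrapper(manhattan)
>>> wrapped(({"x": 1.0}, True), ({"x": 4.0}, False))  # only the feature dicts are compared, the labels are ignored
3.0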
25 | """ 26 | 27 | def __init__(self, distance_function: DistanceFunc): 28 | self.distance_function = distance_function 29 | 30 | def __call__(self, a, b): 31 | # Access x, which is stored in a tuple (x, y) 32 | return self.distance_function(a[0], b[0]) 33 | 34 | 35 | class BaseNN(base.Estimator, abc.ABC): 36 | def __init__(self, dist_func: DistanceFunc | FunctionWrapper): 37 | self.dist_func = dist_func 38 | 39 | @abc.abstractmethod 40 | def append(self, item: typing.Any, **kwargs) -> None: 41 | pass 42 | 43 | @abc.abstractmethod 44 | def search(self, item: typing.Any, n_neighbors: int, **kwargs) -> tuple[list, list]: 45 | pass 46 | -------------------------------------------------------------------------------- /river/neural_net/__init__.py: -------------------------------------------------------------------------------- 1 | """Neural networks.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import activations 6 | from .mlp import MLPRegressor 7 | 8 | __all__ = ["activations", "MLPRegressor"] 9 | -------------------------------------------------------------------------------- /river/optim/__init__.py: -------------------------------------------------------------------------------- 1 | """Stochastic optimization.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import base, initializers, losses, schedulers 6 | from .ada_bound import AdaBound 7 | from .ada_delta import AdaDelta 8 | from .ada_grad import AdaGrad 9 | from .ada_max import AdaMax 10 | from .adam import Adam 11 | from .ams_grad import AMSGrad 12 | from .average import Averager 13 | from .ftrl import FTRLProximal 14 | from .momentum import Momentum 15 | from .nadam import Nadam 16 | from .nesterov import NesterovMomentum 17 | from .rms_prop import RMSProp 18 | from .sgd import SGD 19 | 20 | __all__ = [ 21 | "base", 22 | "AdaBound", 23 | "AdaDelta", 24 | "AdaGrad", 25 | "Adam", 26 | "AMSGrad", 27 | "AdaMax", 28 | "Averager", 29 | "FTRLProximal", 30 | "initializers", 31 | "losses", 32 | "Momentum", 33 | "Nadam", 34 | "NesterovMomentum", 35 | "Optimizer", 36 | "RMSProp", 37 | "schedulers", 38 | "SGD", 39 | ] 40 | -------------------------------------------------------------------------------- /river/optim/ada_grad.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import collections 4 | 5 | from river import optim 6 | 7 | __all__ = ["AdaGrad"] 8 | 9 | 10 | class AdaGrad(optim.base.Optimizer): 11 | """AdaGrad optimizer. 12 | 13 | Parameters 14 | ---------- 15 | lr 16 | eps 17 | 18 | Attributes 19 | ---------- 20 | g2 : collections.defaultdict 21 | 22 | Examples 23 | -------- 24 | 25 | >>> from river import datasets 26 | >>> from river import evaluate 27 | >>> from river import linear_model 28 | >>> from river import metrics 29 | >>> from river import optim 30 | >>> from river import preprocessing 31 | 32 | >>> dataset = datasets.Phishing() 33 | >>> optimizer = optim.AdaGrad() 34 | >>> model = ( 35 | ... preprocessing.StandardScaler() | 36 | ... linear_model.LogisticRegression(optimizer) 37 | ... ) 38 | >>> metric = metrics.F1() 39 | 40 | >>> evaluate.progressive_val_score(dataset, model, metric) 41 | F1: 88.01% 42 | 43 | References 44 | ---------- 45 | [^1]: [Duchi, J., Hazan, E. and Singer, Y., 2011. Adaptive subgradient methods for online learning and stochastic optimization. 
Journal of machine learning research, 12(Jul), pp.2121-2159.](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) 46 | 47 | """ 48 | 49 | def __init__(self, lr=0.1, eps=1e-8): 50 | super().__init__(lr) 51 | self.eps = eps 52 | self.g2 = collections.defaultdict(float) 53 | 54 | def _step_with_dict(self, w, g): 55 | for i, gi in g.items(): 56 | self.g2[i] += gi**2 57 | w[i] -= self.learning_rate / (self.g2[i] + self.eps) ** 0.5 * gi 58 | 59 | return w 60 | -------------------------------------------------------------------------------- /river/optim/momentum.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import collections 4 | 5 | from river import optim 6 | 7 | __all__ = ["Momentum"] 8 | 9 | 10 | class Momentum(optim.base.Optimizer): 11 | """Momentum optimizer. 12 | 13 | Parameters 14 | ---------- 15 | lr 16 | rho 17 | 18 | Examples 19 | -------- 20 | 21 | >>> from river import datasets 22 | >>> from river import evaluate 23 | >>> from river import linear_model 24 | >>> from river import metrics 25 | >>> from river import optim 26 | >>> from river import preprocessing 27 | 28 | >>> dataset = datasets.Phishing() 29 | >>> optimizer = optim.Momentum() 30 | >>> model = ( 31 | ... preprocessing.StandardScaler() | 32 | ... linear_model.LogisticRegression(optimizer) 33 | ... ) 34 | >>> metric = metrics.F1() 35 | 36 | >>> evaluate.progressive_val_score(dataset, model, metric) 37 | F1: 84.09% 38 | 39 | """ 40 | 41 | def __init__(self, lr=0.1, rho=0.9): 42 | super().__init__(lr) 43 | self.rho = rho 44 | self.s = collections.defaultdict(float) 45 | 46 | def _step_with_dict(self, w, g): 47 | for i, gi in g.items(): 48 | self.s[i] = self.rho * self.s[i] + self.learning_rate * gi 49 | w[i] -= self.s[i] 50 | 51 | return w 52 | -------------------------------------------------------------------------------- /river/optim/nesterov.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import collections 4 | 5 | from river import optim 6 | 7 | __all__ = ["NesterovMomentum"] 8 | 9 | 10 | class NesterovMomentum(optim.base.Optimizer): 11 | """Nesterov Momentum optimizer. 12 | 13 | Parameters 14 | ---------- 15 | lr 16 | rho 17 | 18 | Examples 19 | -------- 20 | 21 | >>> from river import datasets 22 | >>> from river import evaluate 23 | >>> from river import linear_model 24 | >>> from river import metrics 25 | >>> from river import optim 26 | >>> from river import preprocessing 27 | 28 | >>> dataset = datasets.Phishing() 29 | >>> optimizer = optim.NesterovMomentum() 30 | >>> model = ( 31 | ... preprocessing.StandardScaler() | 32 | ... linear_model.LogisticRegression(optimizer) 33 | ... 
) 34 | >>> metric = metrics.F1() 35 | 36 | >>> evaluate.progressive_val_score(dataset, model, metric) 37 | F1: 84.22% 38 | 39 | """ 40 | 41 | def __init__(self, lr=0.1, rho=0.9): 42 | super().__init__(lr) 43 | self.rho = rho 44 | self.s = collections.defaultdict(float) 45 | 46 | def look_ahead(self, w): 47 | for i in w: 48 | w[i] -= self.rho * self.s[i] 49 | 50 | return w 51 | 52 | def _step_with_dict(self, w, g): 53 | # Move w back to it's initial position 54 | for i in w: 55 | w[i] += self.rho * self.s[i] 56 | 57 | for i, gi in g.items(): 58 | self.s[i] = self.rho * self.s[i] + self.learning_rate * gi 59 | w[i] -= self.s[i] 60 | 61 | return w 62 | -------------------------------------------------------------------------------- /river/optim/sgd.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import optim 4 | 5 | __all__ = ["SGD"] 6 | 7 | 8 | class SGD(optim.base.Optimizer): 9 | """Plain stochastic gradient descent. 10 | 11 | Parameters 12 | ---------- 13 | lr 14 | 15 | Examples 16 | -------- 17 | 18 | >>> from river import datasets 19 | >>> from river import evaluate 20 | >>> from river import linear_model 21 | >>> from river import metrics 22 | >>> from river import optim 23 | >>> from river import preprocessing 24 | 25 | >>> dataset = datasets.Phishing() 26 | >>> optimizer = optim.SGD(0.1) 27 | >>> model = ( 28 | ... preprocessing.StandardScaler() | 29 | ... linear_model.LogisticRegression(optimizer) 30 | ... ) 31 | >>> metric = metrics.F1() 32 | 33 | >>> evaluate.progressive_val_score(dataset, model, metric) 34 | F1: 87.85% 35 | 36 | References 37 | ---------- 38 | [^1]: [Robbins, H. and Monro, S., 1951. A stochastic approximation method. The annals of mathematical statistics, pp.400-407](https://pdfs.semanticscholar.org/34dd/d8865569c2c32dec9bf7ffc817ff42faaa01.pdf) 39 | 40 | """ 41 | 42 | def __init__(self, lr=0.01) -> None: 43 | super().__init__(lr) 44 | 45 | def _step_with_dict(self, w, g): 46 | for i, gi in g.items(): 47 | w[i] -= self.learning_rate * gi 48 | return w 49 | 50 | def _step_with_vector(self, w, g): 51 | w -= self.learning_rate * g 52 | return w 53 | -------------------------------------------------------------------------------- /river/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """Feature preprocessing. 2 | 3 | The purpose of this module is to modify an existing set of features so that they can be processed 4 | by a machine learning algorithm. This may be done by scaling numeric parts of the data or by 5 | one-hot encoding categorical features. 
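As a small sketch of that idea, one can scale the numeric field and one-hot encode the categorical one in a single transformer union (the field names and pipeline layout are illustrative, not taken from this repository):

```python
from river import compose, preprocessing

preprocess = compose.TransformerUnion(
    compose.Select("temperature") | preprocessing.StandardScaler(),
    compose.Select("colour") | preprocessing.OneHotEncoder(),
)

x = {"temperature": 21.5, "colour": "blue"}
preprocess.learn_one(x)  # update the scaler's statistics and the encoder's vocabulary
print(preprocess.transform_one(x))
```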
The difference with the `feature_extraction` module is that 6 | the latter extracts new information from the data 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from .feature_hasher import FeatureHasher 13 | from .impute import PreviousImputer, StatImputer 14 | from .lda import LDA 15 | from .one_hot import OneHotEncoder 16 | from .ordinal import OrdinalEncoder 17 | from .pred_clipper import PredClipper 18 | from .random_projection import GaussianRandomProjector, SparseRandomProjector 19 | from .scale import ( 20 | AdaptiveStandardScaler, 21 | Binarizer, 22 | MaxAbsScaler, 23 | MinMaxScaler, 24 | Normalizer, 25 | RobustScaler, 26 | StandardScaler, 27 | ) 28 | from .scale_target import TargetMinMaxScaler, TargetStandardScaler 29 | 30 | __all__ = [ 31 | "AdaptiveStandardScaler", 32 | "Binarizer", 33 | "FeatureHasher", 34 | "GaussianRandomProjector", 35 | "LDA", 36 | "MaxAbsScaler", 37 | "MinMaxScaler", 38 | "Normalizer", 39 | "OneHotEncoder", 40 | "OrdinalEncoder", 41 | "PredClipper", 42 | "PreviousImputer", 43 | "RobustScaler", 44 | "SparseRandomProjector", 45 | "StandardScaler", 46 | "StatImputer", 47 | "TargetMinMaxScaler", 48 | "TargetStandardScaler", 49 | ] 50 | -------------------------------------------------------------------------------- /river/preprocessing/test_random_projection.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import numpy as np 4 | 5 | from river import datasets, preprocessing 6 | 7 | 8 | def test_gaussian_random_projector_dot_product(): 9 | dataset = datasets.TrumpApproval() 10 | projector = preprocessing.GaussianRandomProjector(n_components=3) 11 | 12 | for x, y in dataset: 13 | y = projector.transform_one(x) 14 | y_arr = np.array(list(y.values())) 15 | x_arr = np.array(list(x.values())) 16 | P = np.array( 17 | [[projector._projection_matrix[i, j] for j in x] for i in range(projector.n_components)] 18 | ) 19 | np.testing.assert_allclose(x_arr @ P.T, y_arr) 20 | 21 | 22 | def test_sparse_random_projector_dot_product(): 23 | dataset = datasets.TrumpApproval() 24 | projector = preprocessing.SparseRandomProjector(n_components=3, density=0.5) 25 | 26 | for x, y in dataset: 27 | y = projector.transform_one(x) 28 | y_arr = np.array(list(y.values())) 29 | x_arr = np.array(list(x.values())) 30 | P = np.array( 31 | [ 32 | [projector._projection_matrix[j].get(i, 0) for j in x] 33 | for i in range(projector.n_components) 34 | ] 35 | ) 36 | np.testing.assert_allclose(x_arr @ P.T, y_arr) 37 | 38 | 39 | def test_sparse_random_projector_size(): 40 | dataset = datasets.TrumpApproval() 41 | projector = preprocessing.SparseRandomProjector(n_components=3, density=0.5) 42 | 43 | for x, y in dataset: 44 | projector.transform_one(x) 45 | break 46 | 47 | n_weights = sum(len(v) for v in projector._projection_matrix.values()) 48 | assert n_weights < len(x) * projector.n_components 49 | -------------------------------------------------------------------------------- /river/proba/__init__.py: -------------------------------------------------------------------------------- 1 | """Probability distributions.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . 
import base 6 | from .beta import Beta 7 | from .gaussian import Gaussian, MultivariateGaussian 8 | from .multinomial import Multinomial 9 | 10 | __all__ = ["base", "Beta", "Gaussian", "Multinomial", "MultivariateGaussian"] 11 | -------------------------------------------------------------------------------- /river/proba/test_gaussian.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | from river import proba 10 | 11 | 12 | @pytest.mark.parametrize( 13 | "p", 14 | [ 15 | pytest.param( 16 | p, 17 | id=f"{p=}", 18 | ) 19 | for p in [1, 3, 5] 20 | ], 21 | ) 22 | def test_univariate_multivariate_consistency(p): 23 | X = pd.DataFrame(np.random.random((30, p)), columns=range(p)) 24 | 25 | multi = proba.MultivariateGaussian() 26 | single = {c: proba.Gaussian() for c in X.columns} 27 | 28 | for x in X.to_dict(orient="records"): 29 | multi.update(x) 30 | for c, s in single.items(): 31 | s.update(x[c]) 32 | 33 | for c in X.columns: 34 | assert math.isclose(multi.mu[c], single[c].mu) 35 | assert math.isclose(multi.sigma[c][c], single[c].sigma) 36 | -------------------------------------------------------------------------------- /river/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/py.typed -------------------------------------------------------------------------------- /river/reco/__init__.py: -------------------------------------------------------------------------------- 1 | """Recommender systems module. 2 | 3 | Recommender systems (recsys for short) is a large topic. This module is far from comprehensive. It 4 | simply provides models which can contribute towards building a recommender system. 5 | 6 | A typical recommender system is made up of a retrieval phase, followed by a ranking phase. The 7 | output of the retrieval phase is a shortlist of the catalogue of items. The items in the shortlist 8 | are then usually ranked according to the expected preference the user will have for each item. This 9 | module focuses on the ranking phase. 10 | 11 | Models which inherit from the `Ranker` class have a `rank` method. This allows sorting a set of 12 | items for a given user. Each model also has a `learn_one(user, item, y, context)` which allows 13 | learning user preferences. The `y` parameter is a reward value, the nature of which depends is 14 | specific to each and every recommendation task. Typically the reward is a number or a boolean 15 | value. It is up to the user to determine how to translate a user session into training data. 16 | 17 | """ 18 | 19 | from __future__ import annotations 20 | 21 | from . 
import base 22 | from .baseline import Baseline 23 | from .biased_mf import BiasedMF 24 | from .funk_mf import FunkMF 25 | from .normal import RandomNormal 26 | 27 | __all__ = ["base", "Baseline", "BiasedMF", "FunkMF", "RandomNormal"] 28 | -------------------------------------------------------------------------------- /river/rules/__init__.py: -------------------------------------------------------------------------------- 1 | """Decision rules-based algorithms.""" 2 | 3 | from __future__ import annotations 4 | 5 | from .amrules import AMRules 6 | 7 | __all__ = ["AMRules"] 8 | -------------------------------------------------------------------------------- /river/sketch/__init__.py: -------------------------------------------------------------------------------- 1 | """Data containers and collections for sequential data. 2 | 3 | This module has summary and sketch structures that operate with constrained amounts 4 | of memory and processing time. 5 | 6 | """ 7 | 8 | from __future__ import annotations 9 | 10 | from .counter import Counter 11 | from .heavy_hitters import HeavyHitters 12 | from .histogram import Histogram 13 | from .set import Set 14 | 15 | __all__ = ["Counter", "HeavyHitters", "Histogram", "Set"] 16 | -------------------------------------------------------------------------------- /river/stats/__init__.py: -------------------------------------------------------------------------------- 1 | """Running statistics""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import base 6 | from .auto_corr import AutoCorr 7 | from .count import Count 8 | from .cov import Cov 9 | from .entropy import Entropy 10 | from .ewmean import EWMean 11 | from .ewvar import EWVar 12 | from .iqr import IQR, RollingIQR 13 | from .kolmogorov_smirnov import KolmogorovSmirnov 14 | from .kurtosis import Kurtosis 15 | from .link import Link 16 | from .mad import MAD 17 | from .maximum import AbsMax, Max, RollingAbsMax, RollingMax 18 | from .mean import BayesianMean, Mean 19 | from .minimum import Min, RollingMin 20 | from .mode import Mode, RollingMode 21 | from .n_unique import NUnique 22 | from .pearson import PearsonCorr 23 | from .ptp import PeakToPeak, RollingPeakToPeak 24 | from .quantile import Quantile, RollingQuantile 25 | from .sem import SEM 26 | from .shift import Shift 27 | from .skew import Skew 28 | from .summing import Sum 29 | from .var import Var 30 | 31 | __all__ = [ 32 | "base", 33 | "AbsMax", 34 | "AutoCorr", 35 | "BayesianMean", 36 | "Count", 37 | "Cov", 38 | "Entropy", 39 | "EWMean", 40 | "EWVar", 41 | "IQR", 42 | "KolmogorovSmirnov", 43 | "Kurtosis", 44 | "Link", 45 | "MAD", 46 | "Max", 47 | "Mean", 48 | "Min", 49 | "Mode", 50 | "NUnique", 51 | "PeakToPeak", 52 | "PearsonCorr", 53 | "Quantile", 54 | "RollingAbsMax", 55 | "RollingIQR", 56 | "RollingMax", 57 | "RollingMin", 58 | "RollingMode", 59 | "RollingPeakToPeak", 60 | "RollingQuantile", 61 | "SEM", 62 | "Shift", 63 | "Skew", 64 | "Sum", 65 | "Var", 66 | ] 67 | -------------------------------------------------------------------------------- /river/stats/_rust_stats.pyi: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | class RsQuantile: 4 | def __init__(self, q: float): ... 5 | def update(self, x: float): ... 6 | def get(self) -> float: ... 7 | 8 | class RsEWMean: 9 | def __init__(self, alpha: float): ... 10 | def update(self, x: float): ... 11 | def get(self) -> float: ... 12 | 13 | class RsEWVar: 14 | def __init__(self, alpha: float): ... 
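These stubs only describe the Rust-backed internals. From the user's side, every statistic in `river.stats` exposes the same `update`/`get` pair; a quick sketch with a running median (that `stats.Quantile` is backed by `RsQuantile` is an assumption based on these stubs):

```python
from river import stats

median = stats.Quantile(0.5)
for x in [1.0, 2.0, 3.0, 10.0]:
    median.update(x)
print(median.get())
```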
15 | def update(self, x: float): ... 16 | def get(self) -> float: ... 17 | 18 | class RsIQR: 19 | def __init__(self, q_inf: float, q_sup: float): ... 20 | def update(self, x: float): ... 21 | def get(self) -> float: ... 22 | 23 | class RsKurtosis: 24 | def __init__(self, bias: bool): ... 25 | def update(self, x: float): ... 26 | def get(self) -> float: ... 27 | 28 | class RsPeakToPeak: 29 | def __init__(self): ... 30 | def update(self, x: float): ... 31 | def get(self) -> float: ... 32 | 33 | class RsSkew: 34 | def __init__(self, bias: float): ... 35 | def update(self, x: float): ... 36 | def get(self) -> float: ... 37 | 38 | class RsRollingQuantile: 39 | def __init__(self, q: float, window_size: int): ... 40 | def update(self, x: float): ... 41 | def get(self) -> float: ... 42 | 43 | class RsRollingIQR: 44 | def __init__(self, q_inf: float, q_sup: float, window_size: int): ... 45 | def update(self, x: float): ... 46 | def get(self) -> float: ... 47 | -------------------------------------------------------------------------------- /river/stats/count.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stats 4 | 5 | 6 | class Count(stats.base.Univariate): 7 | """A simple counter. 8 | 9 | Attributes 10 | ---------- 11 | n : int 12 | The current number of observations. 13 | 14 | """ 15 | 16 | def __init__(self): 17 | self.n = 0 18 | 19 | def update(self, x=None): 20 | self.n += 1 21 | 22 | def get(self): 23 | return self.n 24 | -------------------------------------------------------------------------------- /river/stats/mad.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from . import quantile 4 | 5 | 6 | class MAD(quantile.Quantile): 7 | """Median Absolute Deviation (MAD). 8 | 9 | The median absolute deviation is the median of the absolute differences between each data point 10 | and the data's overall median. In an online setting, the median of the data is unknown 11 | beforehand. Therefore, both the median of the data and the median of the differences of the 12 | data with respect to the latter are updated online. To be precise, the median of the data is 13 | updated before the median of the differences. As a consequence, this online version of the MAD 14 | does not coincide exactly with its batch counterpart. 15 | 16 | Examples 17 | -------- 18 | 19 | >>> from river import stats 20 | 21 | >>> X = [4, 2, 5, 3, 0, 4] 22 | 23 | >>> mad = stats.MAD() 24 | >>> for x in X: 25 | ... mad.update(x) 26 | ... print(mad.get()) 27 | 0.0 28 | 2.0 29 | 1.0 30 | 1.0 31 | 1.0 32 | 1.0 33 | 34 | Attributes 35 | ---------- 36 | median : stats.Median 37 | The median of the data. 38 | 39 | References 40 | ---------- 41 | [^1]: [Median absolute deviation article on Wikipedia](https://www.wikiwand.com/en/Median_absolute_deviation) 42 | 43 | """ 44 | 45 | # 46 | def __init__(self): 47 | super().__init__(q=0.5) 48 | self.median = quantile.Quantile(q=0.5) 49 | 50 | def update(self, x): 51 | self.median.update(x) 52 | super().update(abs(x - self.median.get())) 53 | -------------------------------------------------------------------------------- /river/stats/minimum.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | 5 | from river import stats, utils 6 | 7 | 8 | class Min(stats.base.Univariate): 9 | """Running min. 
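One detail worth spelling out here: `Min` has no `revert` method, so it cannot be wrapped in `utils.Rolling` (which, as the tests further down show, refuses statistics that cannot be reverted). That is why the dedicated `RollingMin` below keeps a sorted window instead. A short sketch, assuming the `ValueError` behaviour carries over to `Min`:

```python
from river import stats, utils

try:
    utils.Rolling(stats.Min(), window_size=3)
except ValueError as err:
    print(err)  # Min is not "rollable" because it cannot revert past values

rolling_min = stats.RollingMin(3)
for x in [5, 1, 4, 6]:
    rolling_min.update(x)
print(rolling_min.get())  # min over the last 3 values: 1
```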
10 | 11 | Attributes 12 | ---------- 13 | min : float 14 | The current min. 15 | 16 | """ 17 | 18 | def __init__(self): 19 | self.min = math.inf 20 | 21 | def update(self, x): 22 | if x < self.min: 23 | self.min = x 24 | 25 | def get(self): 26 | return self.min 27 | 28 | 29 | class RollingMin(stats.base.RollingUnivariate): 30 | """Running min over a window. 31 | 32 | Parameters 33 | ---------- 34 | window_size 35 | Size of the rolling window. 36 | 37 | Examples 38 | -------- 39 | 40 | >>> from river import stats 41 | 42 | >>> X = [1, -4, 3, -2, 2, 1] 43 | >>> rolling_min = stats.RollingMin(2) 44 | >>> for x in X: 45 | ... rolling_min.update(x) 46 | ... print(rolling_min.get()) 47 | 1 48 | -4 49 | -4 50 | -2 51 | -2 52 | 1 53 | 54 | """ 55 | 56 | def __init__(self, window_size: int): 57 | self.window = utils.SortedWindow(size=window_size) 58 | 59 | @property 60 | def window_size(self): 61 | return self.window.size 62 | 63 | def update(self, x): 64 | self.window.append(x) 65 | 66 | def get(self): 67 | try: 68 | return self.window[0] 69 | except IndexError: 70 | return None 71 | -------------------------------------------------------------------------------- /river/stats/sem.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from . import var 4 | 5 | 6 | class SEM(var.Var): 7 | """Running standard error of the mean using Welford's algorithm. 8 | 9 | Parameters 10 | ---------- 11 | ddof 12 | Delta Degrees of Freedom. The divisor used in calculations is `n - ddof`, where `n` is the 13 | number of seen elements. 14 | 15 | Attributes 16 | ---------- 17 | n : int 18 | Number of observations. 19 | 20 | Examples 21 | -------- 22 | 23 | >>> from river import stats 24 | 25 | >>> X = [3, 5, 4, 7, 10, 12] 26 | 27 | >>> sem = stats.SEM() 28 | >>> for x in X: 29 | ... sem.update(x) 30 | ... print(sem.get()) 31 | 0.0 32 | 1.0 33 | 0.577350 34 | 0.853912 35 | 1.240967 36 | 1.447219 37 | 38 | >>> from river import utils 39 | 40 | >>> X = [1, 4, 2, -4, -8, 0] 41 | 42 | >>> rolling_sem = utils.Rolling(stats.SEM(ddof=1), window_size=3) 43 | >>> for x in X: 44 | ... rolling_sem.update(x) 45 | ... print(rolling_sem.get()) 46 | 0.0 47 | 1.5 48 | 0.881917 49 | 2.403700 50 | 2.905932 51 | 2.309401 52 | 53 | References 54 | ---------- 55 | [^1]: [Wikipedia article on algorithms for calculating variance](https://www.wikiwand.com/en/Algorithms_for_calculating_variance#/Covariance) 56 | 57 | """ 58 | 59 | def get(self): 60 | try: 61 | return (super().get() / self.mean.n) ** 0.5 62 | except ZeroDivisionError: 63 | return None 64 | -------------------------------------------------------------------------------- /river/stats/summing.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import stats 4 | 5 | 6 | class Sum(stats.base.Univariate): 7 | """Running sum. 8 | 9 | Attributes 10 | ---------- 11 | sum : float 12 | The running sum. 13 | 14 | Examples 15 | -------- 16 | 17 | >>> from river import stats 18 | 19 | >>> X = [-5, -3, -1, 1, 3, 5] 20 | >>> mean = stats.Sum() 21 | >>> for x in X: 22 | ... mean.update(x) 23 | ... print(mean.get()) 24 | -5.0 25 | -8.0 26 | -9.0 27 | -8.0 28 | -5.0 29 | 0.0 30 | 31 | >>> from river import utils 32 | 33 | >>> X = [1, -4, 3, -2, 2, 1] 34 | >>> rolling_sum = utils.Rolling(stats.Sum(), window_size=2) 35 | >>> for x in X: 36 | ... rolling_sum.update(x) 37 | ... 
print(rolling_sum.get()) 38 | 1.0 39 | -3.0 40 | -1.0 41 | 1.0 42 | 0.0 43 | 3.0 44 | 45 | """ 46 | 47 | def __init__(self): 48 | self.sum = 0.0 49 | 50 | def update(self, x): 51 | self.sum += x 52 | 53 | def revert(self, x): 54 | self.sum -= x 55 | 56 | def get(self): 57 | return self.sum 58 | -------------------------------------------------------------------------------- /river/stats/test_kolmogorov_smirnov.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from collections import deque 4 | 5 | import numpy as np 6 | from scipy.stats import ks_2samp 7 | 8 | from river import stats 9 | 10 | 11 | def test_incremental_ks_statistics(): 12 | initial_a = np.random.normal(loc=0, scale=1, size=500) 13 | initial_b = np.random.normal(loc=1, scale=1, size=500) 14 | 15 | stream_a = np.random.normal(loc=0, scale=1, size=5000) 16 | stream_b = np.random.normal(loc=1, scale=1, size=5000) 17 | 18 | incremental_ks_statistics = [] 19 | incremental_ks = stats.KolmogorovSmirnov(statistic="ks") 20 | sliding_a = deque(initial_a) 21 | sliding_b = deque(initial_b) 22 | 23 | for a, b in zip(initial_a, initial_b): 24 | incremental_ks.update(a, b) 25 | for a, b in zip(stream_a, stream_b): 26 | incremental_ks.revert(sliding_a.popleft(), sliding_b.popleft()) 27 | sliding_a.append(a) 28 | sliding_b.append(b) 29 | incremental_ks.update(a, b) 30 | incremental_ks_statistics.append(incremental_ks.get()) 31 | 32 | ks_2samp_statistics = [] 33 | sliding_a = deque(initial_a) 34 | sliding_b = deque(initial_b) 35 | 36 | for a, b in zip(stream_a, stream_b): 37 | sliding_a.popleft() 38 | sliding_b.popleft() 39 | sliding_a.append(a) 40 | sliding_b.append(b) 41 | ks_2samp_statistics.append(ks_2samp(sliding_a, sliding_b).statistic) 42 | 43 | assert np.allclose(np.array(incremental_ks_statistics), np.array(ks_2samp_statistics)) 44 | 45 | assert incremental_ks._test_ks_threshold(ca=incremental_ks._ca(p_value=0.05)) is True 46 | -------------------------------------------------------------------------------- /river/stats/test_quantile.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import random 4 | 5 | from river import stats 6 | 7 | 8 | def test_issue_1178(): 9 | """ 10 | 11 | https://github.com/online-ml/river/issues/1178 12 | 13 | >>> from river import stats 14 | 15 | >>> q = stats.Quantile(0.01) 16 | >>> for x in [5, 0, 0, 0, 0, 0, 0, 0]: 17 | ... q.update(x) 18 | ... print(q) 19 | Quantile: 5. 20 | Quantile: 0. 21 | Quantile: 0. 22 | Quantile: 0. 23 | Quantile: 0. 24 | Quantile: 0. 25 | Quantile: 0. 26 | Quantile: 0. 27 | 28 | >>> q = stats.Quantile(0.99) 29 | >>> for x in [5, 0, 0, 0, 0, 0, 0, 0]: 30 | ... q.update(x) 31 | ... print(q) 32 | Quantile: 5. 33 | Quantile: 5. 34 | Quantile: 5. 35 | Quantile: 5. 36 | Quantile: 5. 37 | Quantile: 0. 
38 | Quantile: 0.277778 39 | Quantile: 0.827546 40 | 41 | """ 42 | 43 | 44 | def test_ge(): 45 | low = stats.Quantile(0.01) 46 | high = stats.Quantile(0.99) 47 | 48 | for _ in range(100): 49 | x = random.random() 50 | low.update(x) 51 | high.update(x) 52 | assert high.get() >= low.get() 53 | -------------------------------------------------------------------------------- /river/stats/test_var.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import random 5 | 6 | from river import stats 7 | 8 | 9 | def test_weighted_variance_with_close_numbers(): 10 | """ 11 | 12 | Origin of this test: https://github.com/online-ml/river/issues/732 13 | 14 | This test would fail if Var were implemented with a numerically unstable algorithm. 15 | 16 | """ 17 | 18 | D = [ 19 | (99.99999978143265, 6), 20 | (99.99999989071631, 8), 21 | (99.99999994535816, 6), 22 | (99.99999997267908, 9), 23 | (99.99999998633952, 10), 24 | (99.99999999316977, 3), 25 | (99.99999999829245, 5), 26 | (99.99999999957309, 9), 27 | ] 28 | 29 | var = stats.Var() 30 | 31 | for x, w in D: 32 | var.update(x, w) 33 | 34 | assert var.get() > 0 and math.isclose(var.get(), 4.648047194845607e-15) 35 | 36 | 37 | def test_revert(): 38 | for _ in range(5): 39 | X = [random.random() for _ in range(20)] 40 | 41 | v1 = stats.Var() 42 | v2 = stats.Var() 43 | 44 | for x in X[:10]: 45 | v1.update(x) 46 | v2.update(x) 47 | 48 | for x in X[10:]: 49 | v2.update(x) 50 | for x in X[10:]: 51 | v2.revert(x) 52 | 53 | assert math.isclose(v1.get(), v2.get()) 54 | -------------------------------------------------------------------------------- /river/stream/__init__.py: -------------------------------------------------------------------------------- 1 | """Streaming utilities. 2 | 3 | The module includes tools to iterate over data streams. 
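For instance, here is a hedged sketch of `iter_csv` on an in-memory buffer. The `converters` argument appears in the tests further down; the `target` keyword is an assumption about the current signature, and the data is made up:

```python
import io

from river import stream

buffer = io.StringIO("x1,x2,y\n1,blue,0\n2,red,1\n")
for x, y in stream.iter_csv(buffer, target="y", converters={"x1": int, "y": int}):
    print(x, y)
# expected: {'x1': 1, 'x2': 'blue'} 0
#           {'x1': 2, 'x2': 'red'} 1
```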
4 | 5 | """ 6 | 7 | from __future__ import annotations 8 | 9 | from .cache import Cache 10 | from .iter_arff import iter_arff 11 | from .iter_array import iter_array 12 | from .iter_csv import iter_csv 13 | from .iter_libsvm import iter_libsvm 14 | from .qa import simulate_qa 15 | from .shuffling import shuffle 16 | from .tweet_stream import TwitterLiveStream 17 | from .twitch_chat_stream import TwitchChatStream 18 | 19 | __all__ = [ 20 | "Cache", 21 | "iter_arff", 22 | "iter_array", 23 | "iter_csv", 24 | "iter_libsvm", 25 | "simulate_qa", 26 | "shuffle", 27 | "TwitterLiveStream", 28 | "TwitchChatStream", 29 | ] 30 | 31 | try: 32 | from .iter_polars import iter_polars 33 | 34 | __all__ += ["iter_polars"] 35 | except ImportError: 36 | pass 37 | 38 | try: 39 | from .iter_pandas import iter_pandas 40 | 41 | __all__ += ["iter_pandas"] 42 | except ImportError: 43 | pass 44 | 45 | try: 46 | from .iter_sklearn import iter_sklearn_dataset 47 | 48 | __all__ += ["iter_sklearn_dataset"] 49 | except ImportError: 50 | pass 51 | 52 | try: 53 | from .iter_sql import iter_sql 54 | 55 | __all__ += ["iter_sql"] 56 | except ImportError: 57 | pass 58 | 59 | try: 60 | from .iter_vaex import iter_vaex 61 | 62 | __all__ += ["iter_vaex"] 63 | except ImportError: 64 | pass 65 | -------------------------------------------------------------------------------- /river/stream/iter_pandas.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pandas as pd 4 | 5 | from river import base, stream 6 | 7 | 8 | def iter_pandas( 9 | X: pd.DataFrame, y: pd.Series | pd.DataFrame | None = None, **kwargs 10 | ) -> base.typing.Stream: 11 | """Iterates over the rows of a `pandas.DataFrame`. 12 | 13 | Parameters 14 | ---------- 15 | X 16 | A dataframe of features. 17 | y 18 | A series or a dataframe with one column per target. 19 | kwargs 20 | Extra keyword arguments are passed to the underlying call to `stream.iter_array`. 21 | 22 | Examples 23 | -------- 24 | 25 | >>> import pandas as pd 26 | >>> from river import stream 27 | 28 | >>> X = pd.DataFrame({ 29 | ... 'x1': [1, 2, 3, 4], 30 | ... 'x2': ['blue', 'yellow', 'yellow', 'blue'], 31 | ... 'y': [True, False, False, True] 32 | ... }) 33 | >>> y = X.pop('y') 34 | 35 | >>> for xi, yi in stream.iter_pandas(X, y): 36 | ... print(xi, yi) 37 | {'x1': 1, 'x2': 'blue'} True 38 | {'x1': 2, 'x2': 'yellow'} False 39 | {'x1': 3, 'x2': 'yellow'} False 40 | {'x1': 4, 'x2': 'blue'} True 41 | 42 | """ 43 | 44 | kwargs["feature_names"] = X.columns 45 | if isinstance(y, pd.DataFrame): 46 | kwargs["target_names"] = y.columns 47 | 48 | yield from stream.iter_array(X=X.to_numpy(), y=y if y is None else y.to_numpy(), **kwargs) 49 | -------------------------------------------------------------------------------- /river/stream/iter_polars.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import polars as pl 4 | 5 | from river import base, stream 6 | 7 | 8 | def iter_polars( 9 | X: pl.DataFrame, y: pl.Series | pl.DataFrame | None = None, **kwargs 10 | ) -> base.typing.Stream: 11 | """Iterates over the rows of a `polars.DataFrame`. 12 | 13 | Parameters 14 | ---------- 15 | X 16 | A dataframe of features. 17 | y 18 | A series or a dataframe with one column per target. 19 | kwargs 20 | Extra keyword arguments are passed to the underlying call to `stream.iter_array`. 
21 | 22 | Examples 23 | -------- 24 | 25 | >>> import polars as pl 26 | >>> from river import stream 27 | 28 | >>> X = pl.DataFrame({ 29 | ... 'x1': [1, 2, 3, 4], 30 | ... 'x2': ['blue', 'yellow', 'yellow', 'blue'], 31 | ... 'y': [True, False, False, True] 32 | ... }) 33 | >>> y = X.get_column('y') 34 | >>> X=X.drop("y") 35 | 36 | >>> for xi, yi in stream.iter_polars(X, y): 37 | ... print(xi, yi) 38 | {'x1': 1, 'x2': 'blue'} True 39 | {'x1': 2, 'x2': 'yellow'} False 40 | {'x1': 3, 'x2': 'yellow'} False 41 | {'x1': 4, 'x2': 'blue'} True 42 | 43 | """ 44 | 45 | kwargs["feature_names"] = X.columns 46 | if isinstance(y, pl.DataFrame): 47 | kwargs["target_names"] = y.columns 48 | 49 | yield from stream.iter_array(X=X.to_numpy(), y=y if y is None else y.to_numpy(), **kwargs) 50 | -------------------------------------------------------------------------------- /river/stream/iter_vaex.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import vaex 4 | from vaex.utils import _ensure_list, _ensure_strings_from_expressions 5 | 6 | from river import base 7 | 8 | 9 | def iter_vaex( 10 | X: vaex.dataframe.DataFrame, 11 | y: str | vaex.expression.Expression | None = None, 12 | features: list[str] | vaex.expression.Expression | None = None, 13 | ) -> base.typing.Stream: 14 | """Yields rows from a ``vaex.DataFrame``. 15 | 16 | Parameters 17 | ---------- 18 | X 19 | A vaex DataFrame housing the training features. 20 | y 21 | The column or expression containing the target variable. 22 | features 23 | A list of features used for training. If None, all columns in `X` will be used. Features 24 | specifying in `y` are ignored. 25 | 26 | """ 27 | 28 | features = _ensure_strings_from_expressions(features) 29 | feature_names = features or X.get_column_names() 30 | 31 | if y: 32 | y = _ensure_strings_from_expressions(y) 33 | y = _ensure_list(y) 34 | feature_names = [feat for feat in feature_names if feat not in y] 35 | 36 | multioutput = len(y) > 1 37 | 38 | if multioutput: 39 | for i in range(len(X)): 40 | yield ( 41 | {key: X.evaluate(key, i, i + 1)[0] for key in feature_names}, 42 | {key: X.evaluate(key, i, i + 1)[0] for key in y}, 43 | ) 44 | 45 | else: 46 | for i in range(len(X)): 47 | yield ( 48 | {key: X.evaluate(key, i, i + 1)[0] for key in feature_names}, 49 | X.evaluate(y[0], i, i + 1)[0], 50 | ) 51 | -------------------------------------------------------------------------------- /river/stream/pokedb.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/online-ml/river/9e2ceca900ba53f0ee710a6e69f972b05f74d43a/river/stream/pokedb.zip -------------------------------------------------------------------------------- /river/stream/test_iter_csv.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import io 4 | 5 | from river import stream 6 | 7 | 8 | def test_iter_csv_custom_converter(): 9 | example = io.StringIO("col1,col2,col3\n,1,2\n5,,4\n3,1,") 10 | 11 | def int_or_none(s): 12 | try: 13 | return int(s) 14 | except ValueError: 15 | return None 16 | 17 | params = {"converters": {"col1": int_or_none, "col2": int_or_none, "col3": int_or_none}} 18 | dataset = stream.iter_csv(example, **params) 19 | assert list(dataset) == [ 20 | ({"col1": None, "col2": 1, "col3": 2}, None), 21 | ({"col1": 5, "col2": None, "col3": 4}, None), 22 | ({"col1": 3, "col2": 1, "col3": None}, None), 23 | ] 24 | 25 | 
26 | def test_iter_csv_drop_nones(): 27 | example = io.StringIO("col1,col2,col3\n,1,2\n5,,4\n3,1,") 28 | 29 | def int_or_none(s): 30 | try: 31 | return int(s) 32 | except ValueError: 33 | return None 34 | 35 | params = { 36 | "converters": {"col1": int_or_none, "col2": int_or_none, "col3": int_or_none}, 37 | "drop_nones": True, 38 | } 39 | dataset = stream.iter_csv(example, **params) 40 | assert list(dataset) == [ 41 | ({"col2": 1, "col3": 2}, None), 42 | ({"col1": 5, "col3": 4}, None), 43 | ({"col1": 3, "col2": 1}, None), 44 | ] 45 | -------------------------------------------------------------------------------- /river/stream/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import functools 4 | import gzip 5 | import io 6 | import os 7 | import zipfile 8 | 9 | 10 | def open_filepath(filepath_or_buffer, compression): 11 | # Determine the compression from the file extension if "infer" has been specified 12 | if compression == "infer": 13 | _, ext = os.path.splitext(filepath_or_buffer) 14 | compression = {".gz": "gzip", ".zip": "zip"}.get(ext) 15 | 16 | def open_zipfile(path): 17 | with zipfile.ZipFile(path, "r") as zf: 18 | f = zf.open(zf.namelist()[0], "r") 19 | f = io.TextIOWrapper(f) 20 | return f 21 | 22 | # Determine the file opening method from the compression 23 | open_func = { 24 | None: open, 25 | "gzip": functools.partial(gzip.open, mode="rt"), 26 | "zip": open_zipfile, 27 | }[compression] 28 | 29 | # Open the file using the opening method 30 | return open_func(filepath_or_buffer) 31 | -------------------------------------------------------------------------------- /river/time_series/__init__.py: -------------------------------------------------------------------------------- 1 | """Time series forecasting.""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import base 6 | from .evaluate import evaluate, iter_evaluate 7 | from .holt_winters import HoltWinters 8 | from .metrics import ForecastingMetric, HorizonAggMetric, HorizonMetric 9 | from .snarimax import SNARIMAX 10 | 11 | __all__ = [ 12 | "base", 13 | "evaluate", 14 | "iter_evaluate", 15 | "ForecastingMetric", 16 | "HorizonAggMetric", 17 | "HorizonMetric", 18 | "HoltWinters", 19 | "SNARIMAX", 20 | ] 21 | -------------------------------------------------------------------------------- /river/time_series/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | 5 | from river import base 6 | 7 | __all__ = ["Forecaster"] 8 | 9 | 10 | class Forecaster(base.Estimator): 11 | @property 12 | def _supervised(self): 13 | return True 14 | 15 | @abc.abstractmethod 16 | def learn_one(self, y: float, x: dict | None = None) -> None: 17 | """Updates the model. 18 | 19 | Parameters 20 | ---------- 21 | y 22 | In the literature this is called the endogenous variable. 23 | x 24 | Optional additional features to learn from. In the literature these are called the 25 | exogenous variables. 26 | 27 | """ 28 | raise NotImplementedError 29 | 30 | @abc.abstractmethod 31 | def forecast(self, horizon: int, xs: list[dict] | None = None) -> list: 32 | """Makes forecast at each step of the given horizon. 33 | 34 | Parameters 35 | ---------- 36 | horizon 37 | The number of steps ahead to forecast. 38 | xs 39 | The set of optional additional features. If given, then it's length should be equal to 40 | the horizon. 
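A short usage sketch of this interface with one of the bundled forecasters; the Holt-Winters hyperparameters are illustrative rather than a recommendation:

```python
from river import datasets, time_series

model = time_series.HoltWinters(alpha=0.3, beta=0.1, gamma=0.6, seasonality=12)

for x, y in datasets.AirlinePassengers():
    model.learn_one(y)  # x (the month) is ignored by this purely endogenous model

print(model.forecast(horizon=12))  # one prediction per step ahead
```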
41 | 42 | """ 43 | -------------------------------------------------------------------------------- /river/time_series/test_evaluate.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river import datasets, metrics, stats, time_series 4 | 5 | 6 | class MeanForecaster(time_series.base.Forecaster): 7 | def __init__(self): 8 | self.mean = stats.Mean() 9 | 10 | def learn_one(self, y, x=None): 11 | self.mean.update(y) 12 | 13 | def forecast(self, horizon, xs=None): 14 | return [self.mean.get()] * horizon 15 | 16 | 17 | def test_forecasts_at_each_step(): 18 | dataset = datasets.AirlinePassengers() 19 | model = MeanForecaster() 20 | metric = metrics.MAE() 21 | horizon = 12 22 | grace_period = 1 23 | 24 | steps = time_series.iter_evaluate( 25 | dataset=dataset, model=model, metric=metric, horizon=horizon, grace_period=grace_period 26 | ) 27 | 28 | _, _, y_pred, _ = next(steps) 29 | assert y_pred == [112] * horizon 30 | _, _, y_pred, _ = next(steps) 31 | assert y_pred == [(112 + 118) / 2] * horizon 32 | _, _, y_pred, _ = next(steps) 33 | assert y_pred == [(112 + 118 + 132) / 3] * horizon 34 | _, _, y_pred, _ = next(steps) 35 | assert y_pred == [(112 + 118 + 132 + 129) / 4] * horizon 36 | 37 | n_steps = sum( 38 | 1 39 | for _ in time_series.iter_evaluate( 40 | dataset=dataset, model=model, metric=metric, horizon=horizon, grace_period=grace_period 41 | ) 42 | ) 43 | assert n_steps == dataset.n_samples - horizon - grace_period 44 | -------------------------------------------------------------------------------- /river/tree/losses.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | import math 5 | 6 | from .utils import GradHess 7 | 8 | 9 | class Loss(abc.ABC): 10 | """Base class to implement optimization objectives used in Stochastic Gradient Trees.""" 11 | 12 | @abc.abstractmethod 13 | def compute_derivatives(self, y_true: float, y_pred: float) -> GradHess: 14 | """Return the gradient and hessian data concerning one instance and its prediction. 15 | 16 | Parameters 17 | ---------- 18 | y_true 19 | Target value. 20 | y_pred 21 | Predicted target value. 22 | """ 23 | raise NotImplementedError 24 | 25 | def transfer(self, y: float) -> float: 26 | """Optionally apply some transformation to the value predicted by the tree before 27 | returning it. 28 | 29 | For instance, in classification, the softmax operation might be applied. 30 | 31 | Parameters 32 | ---------- 33 | y 34 | Value to be transformed. 35 | """ 36 | return y 37 | 38 | 39 | class BinaryCrossEntropyLoss(Loss): 40 | """Loss function used in binary classification tasks.""" 41 | 42 | def compute_derivatives(self, y_true, y_pred): 43 | y_trs = self.transfer(y_pred) 44 | 45 | return GradHess(y_trs - y_true, y_trs * (1.0 - y_trs)) 46 | 47 | def transfer(self, y): 48 | return 1.0 / (1.0 + math.exp(-y)) 49 | 50 | 51 | class SquaredErrorLoss(Loss): 52 | """Loss function used in regression tasks.""" 53 | 54 | def compute_derivatives(self, y_true, y_pred): 55 | return GradHess(y_pred - y_true, 1.0) 56 | -------------------------------------------------------------------------------- /river/tree/mondrian/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `river.tree.mondrian` module includes learning and split node 3 | implementations for the Mondrian trees. 
4 | 5 | Note that this module is not exposed in the tree module, and is instead used by the 6 | AMFClassifier and AMFRegressor classes in the ensemble module. 7 | 8 | """ 9 | 10 | from __future__ import annotations 11 | 12 | from .mondrian_tree import MondrianTree 13 | from .mondrian_tree_classifier import MondrianTreeClassifier 14 | from .mondrian_tree_regressor import MondrianTreeRegressor 15 | 16 | __all__ = ["MondrianTree", "MondrianTreeClassifier", "MondrianTreeRegressor"] 17 | -------------------------------------------------------------------------------- /river/tree/mondrian/mondrian_tree.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | import random 5 | 6 | 7 | class MondrianTree(abc.ABC): 8 | """Base class for Mondrian Trees. 9 | 10 | This is an **abstract class**, so it cannot be used directly. It defines base operations 11 | and properties that all the Mondrian Trees must inherit or implement according to 12 | their own design. 13 | 14 | Parameters 15 | ---------- 16 | step 17 | Step parameter of the tree. 18 | loss 19 | Loss to minimize for each node of the tree. At the moment it is a placeholder. 20 | In the future, different optimization metrics might become available. 21 | use_aggregation 22 | Whether or not the tree should it use aggregation. 23 | iteration 24 | Number of iterations to run when training. 25 | seed 26 | Random seed for reproducibility. 27 | 28 | """ 29 | 30 | def __init__( 31 | self, 32 | step: float = 0.1, 33 | loss: str = "log", 34 | use_aggregation: bool = True, 35 | iteration: int = 0, 36 | seed: int | None = None, 37 | ): 38 | # Properties common to all the Mondrian Trees 39 | self.step = step 40 | self.loss = loss 41 | self.use_aggregation = use_aggregation 42 | self.iteration = iteration 43 | 44 | # Controls the randomness in the tree 45 | self.seed = seed 46 | self._rng = random.Random(seed) 47 | -------------------------------------------------------------------------------- /river/tree/nodes/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The `river.tree.nodes` module includes learning and split node 3 | implementations for the hoeffding trees. 
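As a reminder of what these nodes end up supporting, a minimal end-to-end sketch with a Hoeffding tree classifier (the grace period is illustrative):

```python
from river import datasets, evaluate, metrics, tree

model = tree.HoeffdingTreeClassifier(grace_period=50)
print(evaluate.progressive_val_score(datasets.Phishing(), model, metrics.Accuracy()))
```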
4 | """ 5 | -------------------------------------------------------------------------------- /river/tree/setup.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from numpy.distutils.misc_util import Configuration 4 | 5 | 6 | def configuration(parent_package="", top_path=None): 7 | config = Configuration("tree", parent_package, top_path) 8 | 9 | # submodules which do not have their own setup.py 10 | config.add_subpackage("splitter") 11 | config.add_subpackage("nodes") 12 | config.add_subpackage("split_criterion") 13 | 14 | return config 15 | 16 | 17 | if __name__ == "__main__": 18 | from numpy.distutils.core import setup 19 | 20 | setup(**configuration().todict()) 21 | -------------------------------------------------------------------------------- /river/tree/split_criterion/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .gini_split_criterion import GiniSplitCriterion 4 | from .hellinger_distance_criterion import HellingerDistanceCriterion 5 | from .info_gain_split_criterion import InfoGainSplitCriterion 6 | from .intra_cluster_variance_reduction_split_criterion import ( 7 | IntraClusterVarianceReductionSplitCriterion, 8 | ) 9 | from .variance_ratio_split_criterion import VarianceRatioSplitCriterion 10 | from .variance_reduction_split_criterion import VarianceReductionSplitCriterion 11 | 12 | __all__ = [ 13 | "GiniSplitCriterion", 14 | "HellingerDistanceCriterion", 15 | "InfoGainSplitCriterion", 16 | "IntraClusterVarianceReductionSplitCriterion", 17 | "VarianceRatioSplitCriterion", 18 | "VarianceReductionSplitCriterion", 19 | ] 20 | -------------------------------------------------------------------------------- /river/tree/split_criterion/base.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import abc 4 | 5 | 6 | class SplitCriterion(abc.ABC): 7 | """SplitCriterion 8 | 9 | Abstract class for computing splitting criteria with respect to distributions of class values. 10 | The split criterion is used as a parameter on decision trees and decision stumps. 11 | 12 | This class should not me instantiated, as none of its methods are implemented. 13 | 14 | """ 15 | 16 | def __init__(self): 17 | super().__init__() 18 | 19 | @abc.abstractmethod 20 | def merit_of_split(self, pre_split_dist, post_split_dist): 21 | """Compute the merit of splitting for a given distribution before the split and after it. 22 | 23 | Parameters 24 | ---------- 25 | pre_split_dist 26 | The target statistics before the split. 27 | post_split_dist 28 | the target statistics after the split. 29 | 30 | Returns 31 | ------- 32 | Value of the merit of splitting 33 | """ 34 | 35 | @abc.abstractmethod 36 | def current_merit(self, dist): 37 | """Compute the merit of the distribution. 38 | 39 | Parameters 40 | ---------- 41 | dist 42 | The data distribution. 43 | 44 | Returns 45 | ------- 46 | Value of merit of the distribution according to the splitting criterion 47 | """ 48 | 49 | @staticmethod 50 | @abc.abstractmethod 51 | def range_of_merit(pre_split_dist): 52 | """Compute the range of splitting merit. 53 | 54 | Parameters 55 | ---------- 56 | pre_split_dist 57 | The target statistics before the split. 
58 | 59 | Returns 60 | ------- 61 | Value of the range of splitting merit 62 | """ 63 | -------------------------------------------------------------------------------- /river/tree/split_criterion/intra_cluster_variance_reduction_split_criterion.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .variance_reduction_split_criterion import VarianceReductionSplitCriterion 4 | 5 | 6 | # This class extends VarianceReductionSplitCriterion since it just computes 7 | # the variance differently than its ancestor (considering multiple targets) 8 | class IntraClusterVarianceReductionSplitCriterion(VarianceReductionSplitCriterion): 9 | def __init__(self, min_samples_split: int = 5): 10 | super().__init__(min_samples_split) 11 | 12 | def merit_of_split(self, pre_split_dist, post_split_dist): 13 | icvr = 0.0 14 | n = list(pre_split_dist.values())[0].mean.n 15 | 16 | count = 0 17 | 18 | for dist in post_split_dist: 19 | n_i = list(dist.values())[0].mean.n 20 | if n_i >= self.min_samples_split: 21 | count += 1 22 | 23 | if count == len(post_split_dist): 24 | icvr = self.compute_var(pre_split_dist) 25 | for dist in post_split_dist: 26 | n_i = list(dist.values())[0].mean.n 27 | icvr -= n_i / n * self.compute_var(dist) 28 | return icvr 29 | 30 | def current_merit(self, dist): 31 | return self.compute_var(dist) 32 | 33 | @staticmethod 34 | def compute_var(dist): 35 | icvr = [vr.get() for vr in dist.values()] 36 | n = len(icvr) 37 | return sum(icvr) / n if n > 0 else 0.0 38 | -------------------------------------------------------------------------------- /river/tree/splitter/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module implements the Attribute Observers (AO) (or tree splitters) that are used by the 3 | Hoeffding Trees (HT). It also implements the feature quantizers (FQ) used by Stochastic Gradient 4 | Trees (SGT). AOs are a core aspect of the HTs construction, and might represent one of the major 5 | bottlenecks when building the trees. The same holds for SGTs and FQs. The correct choice and setup 6 | of a splitter might result in significant differences in the running time and memory usage of the 7 | incremental decision trees. 8 | 9 | AOs for classification and regression trees can be differentiated by using the property 10 | `is_target_class` (`True` for splitters designed to classification tasks). An error will be raised 11 | if one tries to use a classification splitter in a regression tree and vice-versa. 12 | Lastly, AOs cannot be used in SGT and FQs cannot be used in Hoeffding Trees. So, care must be taken 13 | when choosing the correct feature splitter. 
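As a concrete illustration of that choice, the splitters exported just below can be passed directly to the matching tree type, and mixing them up raises an error as stated above. A hedged sketch relying on the constructors' default arguments:

```python
from river import tree
from river.tree.splitter import ExhaustiveSplitter, QOSplitter

# A regression attribute observer for a regression tree...
reg = tree.HoeffdingTreeRegressor(splitter=QOSplitter())
# ...and a classification attribute observer for a classification tree.
clf = tree.HoeffdingTreeClassifier(splitter=ExhaustiveSplitter())
```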
14 | 15 | """ 16 | 17 | from __future__ import annotations 18 | 19 | from .base import Quantizer, Splitter 20 | from .ebst_splitter import EBSTSplitter 21 | from .exhaustive_splitter import ExhaustiveSplitter 22 | from .gaussian_splitter import GaussianSplitter 23 | from .histogram_splitter import HistogramSplitter 24 | from .qo_splitter import QOSplitter 25 | from .sgt_quantizer import DynamicQuantizer, StaticQuantizer 26 | from .tebst_splitter import TEBSTSplitter 27 | 28 | __all__ = [ 29 | "DynamicQuantizer", 30 | "EBSTSplitter", 31 | "ExhaustiveSplitter", 32 | "GaussianSplitter", 33 | "HistogramSplitter", 34 | "QOSplitter", 35 | "Quantizer", 36 | "Splitter", 37 | "StaticQuantizer", 38 | "TEBSTSplitter", 39 | ] 40 | -------------------------------------------------------------------------------- /river/tree/splitter/tebst_splitter.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from .ebst_splitter import EBSTSplitter 4 | 5 | 6 | class TEBSTSplitter(EBSTSplitter): 7 | """Truncated E-BST. 8 | 9 | Variation of E-BST that rounds the incoming feature values before passing them to the binary 10 | search tree (BST). By doing so, the attribute observer might reduce its processing time and 11 | memory usage since small variations in the input values will end up being mapped to the same 12 | BST node. 13 | 14 | Parameters 15 | ---------- 16 | digits 17 | The number of decimal places used to round the input feature values. 18 | 19 | """ 20 | 21 | def __init__(self, digits: int = 1): 22 | super().__init__() 23 | self.digits = digits 24 | 25 | def update(self, att_val, target_val, w): 26 | try: 27 | att_val = round(att_val, self.digits) 28 | super().update(att_val, target_val, w) 29 | except TypeError: # feature value is None 30 | pass 31 | 32 | def cond_proba(self, att_val, target_val): 33 | """Not implemented in regression splitters.""" 34 | raise NotImplementedError 35 | -------------------------------------------------------------------------------- /river/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Shared utility classes and functions""" 2 | 3 | from __future__ import annotations 4 | 5 | from . import inspect, math, norm, pretty, random 6 | from .context_managers import log_method_calls 7 | from .param_grid import expand_param_grid 8 | from .rolling import Rolling, TimeRolling 9 | from .sorted_window import SortedWindow 10 | from .vectordict import VectorDict 11 | 12 | __all__ = [ 13 | "expand_param_grid", 14 | "inspect", 15 | "log_method_calls", 16 | "math", 17 | "pretty", 18 | "random", 19 | "norm", 20 | "Rolling", 21 | "SortedWindow", 22 | "VectorDict", 23 | "TimeRolling", 24 | ] 25 | -------------------------------------------------------------------------------- /river/utils/context_managers.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from river.base.base import log_method_calls 4 | 5 | __all__ = ["log_method_calls"] 6 | -------------------------------------------------------------------------------- /river/utils/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import math 4 | import random 5 | 6 | __all__ = ["poisson", "exponential"] 7 | 8 | 9 | def poisson(rate: float, rng=random) -> int: 10 | """Sample a random value from a Poisson distribution. 
11 | 12 | Parameters 13 | ---------- 14 | rate 15 | rng 16 | 17 | References 18 | ---------- 19 | [^1] [Wikipedia article](https://www.wikiwand.com/en/Poisson_distribution#/Generating_Poisson-distributed_random_variables) 20 | 21 | """ 22 | 23 | L = math.exp(-rate) 24 | k = 0 25 | p = 1 26 | 27 | while p > L: 28 | k += 1 29 | p *= rng.random() 30 | 31 | return k - 1 32 | 33 | 34 | def exponential(rate: float = 1.0, rng=random) -> float: 35 | """Sample a random value from a Poisson distribution. 36 | 37 | Parameters 38 | ---------- 39 | rate 40 | rng 41 | 42 | References 43 | ---------- 44 | [^1]: [Wikipedia article](https://www.wikiwand.com/en/Exponential_distribution#Random_variate_generation) 45 | 46 | """ 47 | 48 | u = rng.random() 49 | 50 | # Retrieve the λ value from the rate (β): β = 1 / λ 51 | lmbda = 1.0 / rate 52 | return -math.log(1 - u) / lmbda 53 | -------------------------------------------------------------------------------- /river/utils/sorted_window.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import bisect 4 | import collections 5 | 6 | 7 | class SortedWindow(collections.UserList): 8 | """Sorted running window data structure. 9 | 10 | Parameters 11 | ---------- 12 | size 13 | Size of the window to compute the rolling quantile. 14 | 15 | Examples 16 | -------- 17 | 18 | >>> from river import utils 19 | 20 | >>> window = utils.SortedWindow(size=3) 21 | 22 | >>> for i in reversed(range(9)): 23 | ... window.append(i) 24 | ... print(window) 25 | [8] 26 | [7, 8] 27 | [6, 7, 8] 28 | [5, 6, 7] 29 | [4, 5, 6] 30 | [3, 4, 5] 31 | [2, 3, 4] 32 | [1, 2, 3] 33 | [0, 1, 2] 34 | 35 | References 36 | ---------- 37 | [^1]: [Left sorted inserts in Python](https://stackoverflow.com/questions/8024571/insert-an-item-into-sorted-list-in-python) 38 | 39 | """ 40 | 41 | def __init__(self, size: int): 42 | super().__init__() 43 | self.unsorted_window: collections.deque = collections.deque(maxlen=size) 44 | 45 | @property 46 | def size(self): 47 | return self.unsorted_window.maxlen 48 | 49 | def append(self, x) -> None: 50 | if len(self) >= self.size: 51 | # The window is sorted, and a binary search is more optimized than linear search 52 | start_deque = bisect.bisect_left(self, self.unsorted_window[0]) 53 | del self[start_deque] 54 | 55 | bisect.insort_left(self, x) 56 | self.unsorted_window.append(x) 57 | -------------------------------------------------------------------------------- /river/utils/test_rolling.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import datetime as dt 4 | 5 | import pytest 6 | 7 | from river import proba, stats, utils 8 | 9 | 10 | def test_with_counter(): 11 | """ 12 | >>> from river import utils 13 | >>> import collections 14 | >>> collections.Counter.revert = collections.Counter.subtract 15 | 16 | >>> counter = utils.Rolling(collections.Counter(), window_size=3) 17 | 18 | >>> for i in range(5): 19 | ... 
counter.update([i]) 20 | 21 | >>> counter 22 | Counter({2: 1, 3: 1, 4: 1, 0: 0, 1: 0}) 23 | 24 | >>> counter.most_common(3) 25 | [(2, 1), (3, 1), (4, 1)] 26 | 27 | >>> counter[4] 28 | 1 29 | 30 | """ 31 | 32 | 33 | def test_rolling_with_not_rollable(): 34 | with pytest.raises(ValueError): 35 | utils.Rolling(stats.Quantile(), window_size=10) 36 | 37 | 38 | def test_time_rolling_with_not_rollable(): 39 | with pytest.raises(ValueError): 40 | utils.TimeRolling(stats.Quantile(), period=dt.timedelta(seconds=10)) 41 | 42 | 43 | def test_issue_1343(): 44 | """ 45 | 46 | https://github.com/online-ml/river/issues/1343 47 | 48 | """ 49 | rmean = utils.TimeRolling(proba.MultivariateGaussian(), period=dt.timedelta(microseconds=1)) 50 | t = dt.datetime.now() 51 | rmean.update({"a": 0}, t=t) 52 | rmean.update({"a": 1}, t=t) 53 | --------------------------------------------------------------------------------
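To round off these tests, a small sketch of the two wrappers they exercise: `utils.Rolling` forgets values once the window is full, while `utils.TimeRolling` forgets them once they fall outside the period, and both rely on the wrapped object exposing `revert`.

```python
import datetime as dt

from river import stats, utils

rolling_mean = utils.Rolling(stats.Mean(), window_size=3)
for x in [1, 2, 3, 4, 5]:
    rolling_mean.update(x)
print(rolling_mean.get())  # mean of the last three values: 4.0

time_mean = utils.TimeRolling(stats.Mean(), period=dt.timedelta(hours=1))
now = dt.datetime(2024, 1, 1, 12, 0)
time_mean.update(10, t=now)
time_mean.update(20, t=now + dt.timedelta(minutes=30))
print(time_mean.get())  # both values fall within the hour: 15.0
```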