├── .gitignore
├── test
│   ├── recsys
│   │   ├── __init__.py
│   │   ├── graphs
│   │   │   ├── __init__.py
│   │   │   ├── feature_selection
│   │   │   │   └── __init__.py
│   │   │   ├── test_networkx_implementation
│   │   │   │   └── __init__.py
│   │   │   └── test_graph.py
│   │   ├── content_based_algorithm
│   │   │   ├── __init__.py
│   │   │   ├── classifier
│   │   │   │   └── __init__.py
│   │   │   ├── regressor
│   │   │   │   └── __init__.py
│   │   │   ├── centroid_vector
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_similarities.py
│   │   │   ├── index_query
│   │   │   │   └── __init__.py
│   │   │   └── test_contents_loader.py
│   │   ├── graph_based_algorithm
│   │   │   ├── __init__.py
│   │   │   ├── page_rank
│   │   │   │   └── __init__.py
│   │   │   └── test_graph_based_algorithm.py
│   │   └── visual_based_algorithm
│   │       ├── __init__.py
│   │       └── vbpr
│   │           └── __init__.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   ├── metrics
│   │   │   └── __init__.py
│   │   └── eval_pipeline_modules
│   │       └── __init__.py
│   ├── content_analyzer
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   ├── test_check_tokenization.py
│   │   │   └── test_id_merger.py
│   │   ├── embeddings
│   │   │   ├── __init__.py
│   │   │   ├── embedding_learner
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_doc2vec.py
│   │   │   │   ├── test_word2vec.py
│   │   │   │   ├── test_fasttext.py
│   │   │   │   ├── test_random_indexing.py
│   │   │   │   ├── test_lda.py
│   │   │   │   └── test_latent_semantic_analysis.py
│   │   │   ├── embedding_loader
│   │   │   │   ├── __init__.py
│   │   │   │   ├── test_sbert.py
│   │   │   │   └── test_gensim_loader.py
│   │   │   └── test_embedding_source.py
│   │   ├── memory_interfaces
│   │   │   └── __init__.py
│   │   ├── ratings_manager
│   │   │   ├── __init__.py
│   │   │   ├── test_sentiment_analysis.py
│   │   │   └── test_rating_processor.py
│   │   ├── information_processor
│   │   │   ├── __init__.py
│   │   │   ├── test_visualpostprocessors
│   │   │   │   └── __init__.py
│   │   │   └── test_visualpreprocessors
│   │   │       ├── __init__.py
│   │   │       └── test_torch_builtin_augmenter.py
│   │   ├── field_content_production_techniques
│   │   │   ├── __init__.py
│   │   │   ├── visual_technique
│   │   │   │   └── __init__.py
│   │   │   ├── embedding_technique
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_combining_technique.py
│   │   │   ├── test_int.json
│   │   │   ├── test_synset_document_frequency.py
│   │   │   └── test_tf_idf.py
│   │   ├── __init__.py
│   │   ├── content_representation
│   │   │   ├── __init__.py
│   │   │   └── test_representation_container.py
│   │   └── test_config.py
│   ├── test_files
│   │   ├── complex_contents
│   │   │   ├── index
│   │   │   │   ├── MAIN_WRITELOCK
│   │   │   │   ├── _MAIN_2.toc
│   │   │   │   └── MAIN_hlk83r2qer820iyx.seg
│   │   │   ├── users_codified
│   │   │   │   ├── 1.xz
│   │   │   │   ├── 10.xz
│   │   │   │   ├── 11.xz
│   │   │   │   ├── 12.xz
│   │   │   │   ├── 13.xz
│   │   │   │   ├── 14.xz
│   │   │   │   ├── 15.xz
│   │   │   │   ├── 16.xz
│   │   │   │   ├── 17.xz
│   │   │   │   ├── 18.xz
│   │   │   │   ├── 19.xz
│   │   │   │   ├── 2.xz
│   │   │   │   ├── 20.xz
│   │   │   │   ├── 21.xz
│   │   │   │   ├── 22.xz
│   │   │   │   ├── 23.xz
│   │   │   │   ├── 24.xz
│   │   │   │   ├── 25.xz
│   │   │   │   ├── 26.xz
│   │   │   │   ├── 27.xz
│   │   │   │   ├── 28.xz
│   │   │   │   ├── 29.xz
│   │   │   │   ├── 3.xz
│   │   │   │   ├── 30.xz
│   │   │   │   ├── 31.xz
│   │   │   │   ├── 32.xz
│   │   │   │   ├── 33.xz
│   │   │   │   ├── 34.xz
│   │   │   │   ├── 35.xz
│   │   │   │   ├── 36.xz
│   │   │   │   ├── 37.xz
│   │   │   │   ├── 38.xz
│   │   │   │   ├── 39.xz
│   │   │   │   ├── 4.xz
│   │   │   │   ├── 40.xz
│   │   │   │   ├── 41.xz
│   │   │   │   ├── 42.xz
│   │   │   │   ├── 43.xz
│   │   │   │   ├── 44.xz
│   │   │   │   ├── 45.xz
│   │   │   │   ├── 46.xz
│   │   │   │   ├── 47.xz
│   │   │   │   ├── 48.xz
│   │   │   │   ├── 49.xz
│   │   │   │   ├── 5.xz
│   │   │   │   ├── 50.xz
│   │   │   │   ├── 51.xz
│   │   │   │   ├── 52.xz
│   │   │   │   ├── 53.xz
│   │   │   │   ├── 54.xz
│   │   │   │   ├── 55.xz
│   │   │   │   ├── 56.xz
│   │   │   │   ├── 57.xz
│   │   │   │   ├── 58.xz
│   │   │   │   ├── 59.xz
│   │   │   │   ├── 6.xz
│   │   │   │   ├── 60.xz
│   │   │   │   ├── 61.xz
│   │   │   │   ├── 62.xz
│   │   │   │   ├── 63.xz
│   │   │   │   ├── 64.xz
│   │   │   │   ├── 65.xz
│   │   │   │   ├── 66.xz
│   │   │   │   ├── 67.xz
│   │   │   │   ├── 68.xz
│   │   │   │   ├── 69.xz
│   │   │   │   ├── 7.xz
│   │   │   │   ├── 70.xz
│   │   │   │   ├── 8.xz
│   │   │   │   └── 9.xz
│   │   │   ├── movies_codified
│   │   │   │   ├── tt0112281.xz
│   │   │   │   ├── tt0112302.xz
│   │   │   │   ├── tt0112346.xz
│   │   │   │   ├── tt0112453.xz
│   │   │   │   ├── tt0112641.xz
│   │   │   │   ├── tt0112760.xz
│   │   │   │   ├── tt0112896.xz
│   │   │   │   ├── tt0113041.xz
│   │   │   │   ├── tt0113101.xz
│   │   │   │   ├── tt0113189.xz
│   │   │   │   ├── tt0113228.xz
│   │   │   │   ├── tt0113277.xz
│   │   │   │   ├── tt0113497.xz
│   │   │   │   ├── tt0113845.xz
│   │   │   │   ├── tt0113987.xz
│   │   │   │   ├── tt0114319.xz
│   │   │   │   ├── tt0114388.xz
│   │   │   │   ├── tt0114576.xz
│   │   │   │   ├── tt0114709.xz
│   │   │   │   └── tt0114885.xz
│   │   │   └── create_complex_contents.py
│   │   ├── test_embedding_models
│   │   │   ├── ri_model.model
│   │   │   ├── doc2vec_model.kv
│   │   │   ├── fasttext_model.kv
│   │   │   ├── word2vec_model.kv
│   │   │   └── lsa
│   │   │       ├── lsa_model.model
│   │   │       └── lsa_model.model.projection
│   │   ├── random_tsv.tsv
│   │   ├── test_images
│   │   │   ├── images_files
│   │   │   │   ├── o-neill-dress-black-and-white-164-1.jpg
│   │   │   │   ├── wildfox-floral-print-leggings-357-1.jpg
│   │   │   │   ├── anthropologie-skirt-light-pink-434-1.jpg
│   │   │   │   ├── haute-hippie-top-white-and-black-1015-1.jpg
│   │   │   │   └── elizabeth-and-james-top-ecru-and-pink-81-1.jpg
│   │   │   ├── tradesy_small_local_relative_paths.json
│   │   │   └── tradesy_small_online.json
│   │   ├── d2v_test_data.json
│   │   ├── users_info.json
│   │   ├── test_dbpedia
│   │   │   └── movies_info_reduced.json
│   │   ├── test_ratings
│   │   │   └── ratings_1591277020.csv
│   │   ├── test_import_ratings.json
│   │   ├── test_decode
│   │   │   ├── movies_title_string.json
│   │   │   ├── movies_title_tfidf.json
│   │   │   └── movies_title_embedding.json
│   │   ├── users_70.dat
│   │   └── movies_info_reduced.csv
│   ├── utils
│   │   ├── __init__.py
│   │   ├── test_load_content.py
│   │   ├── test_class_utils.py
│   │   ├── test_context_managers.py
│   │   └── test_automatic_methods.py
│   └── __init__.py
├── MANIFEST.in
├── clayrs
│   ├── content_analyzer
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   ├── check_tokenization.py
│   │   │   └── id_merger.py
│   │   ├── content_representation
│   │   │   └── __init__.py
│   │   ├── information_processor
│   │   │   ├── postprocessors
│   │   │   │   └── __init__.py
│   │   │   ├── visual_preprocessors
│   │   │   │   ├── __init__.py
│   │   │   │   └── torch_builtin_augmenter.py
│   │   │   ├── __init__.py
│   │   │   └── information_processor_abstract.py
│   │   ├── memory_interfaces
│   │   │   └── __init__.py
│   │   ├── embeddings
│   │   │   ├── __init__.py
│   │   │   ├── embedding_loader
│   │   │   │   ├── __init__.py
│   │   │   │   ├── gensim.py
│   │   │   │   ├── sbert.py
│   │   │   │   └── vector_strategy.py
│   │   │   └── embedding_learner
│   │   │       ├── __init__.py
│   │   │       ├── word2vec.py
│   │   │       ├── fasttext.py
│   │   │       ├── doc2vec.py
│   │   │       ├── lda.py
│   │   │       ├── random_indexing.py
│   │   │       └── latent_semantic_analysis.py
│   │   ├── ratings_manager
│   │   │   ├── __init__.py
│   │   │   ├── sentiment_analysis.py
│   │   │   └── score_processor.py
│   │   ├── field_content_production_techniques
│   │   │   ├── visual_techniques
│   │   │   │   └── __init__.py
│   │   │   ├── embedding_technique
│   │   │   │   └── __init__.py
│   │   │   └── __init__.py
│   │   ├── __init__.py
│   │   └── exceptions.py
│   ├── evaluation
│   │   ├── eval_pipeline_modules
│   │   │   └── __init__.py
│   │   ├── exceptions.py
│   │   ├── __init__.py
│   │   └── metrics
│   │       ├── __init__.py
│   │       └── metrics.py
│   ├── recsys
│   │   ├── visual_based_algorithm
│   │   │   ├── __init__.py
│   │   │   └── vbpr
│   │   │       └── __init__.py
│   │   ├── graph_based_algorithm
│   │   │   ├── page_rank
│   │   │   │   └── __init__.py
│   │   │   └── __init__.py
│   │   ├── content_based_algorithm
│   │   │   ├── index_query
│   │   │   │   └── __init__.py
│   │   │   ├── centroid_vector
│   │   │   │   ├── __init__.py
│   │   │   │   └── similarities.py
│   │   │   ├── classifier
│   │   │   │   └── __init__.py
│   │   │   ├── regressor
│   │   │   │   └── __init__.py
│   │   │   ├── __init__.py
│   │   │   └── exceptions.py
│   │   ├── graphs
│   │   │   ├── __init__.py
│   │   │   ├── feature_selection
│   │   │   │   ├── exceptions.py
│   │   │   │   └── __init__.py
│   │   │   ├── nx_implementation
│   │   │   │   └── __init__.py
│   │   │   └── graph_metrics.py
│   │   ├── __init__.py
│   │   └── algorithm.py
│   ├── __init__.py
│   └── utils
│       ├── __init__.py
│       ├── const.py
│       ├── automatic_methods.py
│       ├── custom_logger.py
│       ├── class_utils.py
│       ├── load_content.py
│       └── context_managers.py
├── setup.cfg
├── pyproject.toml
├── docs
│   └── mkdocs
│       ├── requirements-doc.txt
│       └── docs
│           ├── evaluation
│           │   ├── eval_model.md
│           │   ├── metrics
│           │   │   ├── plot_metrics.md
│           │   │   ├── error_metrics.md
│           │   │   ├── ranking_metrics.md
│           │   │   ├── fairness_metrics.md
│           │   │   └── classification_metrics.md
│           │   └── statistical_tests
│           │       └── paired.md
│           ├── img
│           │   ├── colab_examples_1.png
│           │   └── colab_examples_2.png
│           ├── content_analyzer
│           │   ├── index_interface.md
│           │   ├── content_techniques
│           │   │   ├── from_npy.md
│           │   │   ├── textual_techniques
│           │   │   │   ├── original_data.md
│           │   │   │   ├── synset_df_frequency.md
│           │   │   │   ├── tfidf.md
│           │   │   │   └── embedding_techniques
│           │   │   │       ├── document_embeddings.md
│           │   │   │       ├── sentence_embeddings.md
│           │   │   │       ├── contextualized_embeddings.md
│           │   │   │       ├── word_embeddings.md
│           │   │   │       └── combining_embeddings.md
│           │   │   └── visual_techniques
│           │   │       ├── high_level_visual.md
│           │   │       └── low_level_visual.md
│           │   ├── raw_sources.md
│           │   ├── information_preprocessors
│           │   │   ├── textual_preprocessors
│           │   │   │   ├── nltk.md
│           │   │   │   ├── spacy.md
│           │   │   │   └── ekphrasis.md
│           │   │   ├── visual_preprocessors
│           │   │   │   └── torch_preprocessors.md
│           │   │   └── postprocessors
│           │   │       └── postprocessor.md
│           │   ├── exogenous_techniques
│           │   │   ├── babelfy.md
│           │   │   ├── dbpedia.md
│           │   │   └── properties_from_dataset.md
│           │   ├── ratings
│           │   │   ├── ratings.md
│           │   │   └── score_processors.md
│           │   └── config.md
│           ├── utils
│           │   └── report.md
│           ├── recsys
│           │   ├── methodology
│           │   │   ├── all_items.md
│           │   │   ├── test_items.md
│           │   │   ├── test_ratings.md
│           │   │   ├── training_items.md
│           │   │   └── abstract_methodology.md
│           │   ├── graph_based
│           │   │   ├── graph_based_recsys.md
│           │   │   ├── graph_based_algorithms
│           │   │   │   └── nx_pagerank.md
│           │   │   ├── graphs
│           │   │   │   ├── nx_bipartite.md
│           │   │   │   ├── nx_full.md
│           │   │   │   ├── nx_tripartite.md
│           │   │   │   └── nodes.md
│           │   │   └── feature_selection.md
│           │   ├── partitioning
│           │   │   ├── kfold.md
│           │   │   ├── bootstrap.md
│           │   │   ├── hold_out.md
│           │   │   └── abstract_partitioning.md
│           │   ├── content_based
│           │   │   ├── content_based_recsys.md
│           │   │   ├── content_based_algorithms
│           │   │   │   ├── index_query.md
│           │   │   │   ├── linear_predictor.md
│           │   │   │   ├── centroid_vector.md
│           │   │   │   └── classifier_recommender.md
│           │   │   └── visual_based_algorithms
│           │   │       └── vbpr.md
│           │   └── experiment.md
│           ├── javascripts
│           │   └── mathjax.js
│           ├── first_steps
│           │   ├── installation.md
│           │   └── colab_examples.md
│           └── index.md
├── codecov.yml
├── datasets
│   └── ml-100k_extra_small
│       ├── users_extra_small.csv
│       └── ratings_extra_small.csv
├── .gitattributes
├── .coveragerc
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── feature_request.md
│   │   └── bug_report.md
│   └── workflows
│       ├── docs_building.yml
│       └── testing_pipeline.yml
├── requirements.txt
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 | 
--------------------------------------------------------------------------------
/test/recsys/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/graphs/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/evaluation/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | 
--------------------------------------------------------------------------------
/clayrs/content_analyzer/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/graph_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/graphs/feature_selection/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/visual_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/memory_interfaces/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/ratings_manager/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/evaluation/eval_pipeline_modules/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/visual_based_algorithm/vbpr/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/test_files/complex_contents/index/MAIN_WRITELOCK:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/clayrs/evaluation/eval_pipeline_modules/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/classifier/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/regressor/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/graph_based_algorithm/page_rank/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_loader/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/centroid_vector/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/index_query/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/recsys/graphs/test_networkx_implementation/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/clayrs/recsys/visual_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | from .vbpr import *
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "Cython"]
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/test_visualpostprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/test_visualpreprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/clayrs/content_analyzer/content_representation/__init__.py:
--------------------------------------------------------------------------------
1 | from .content import Content
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/visual_technique/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/clayrs/recsys/visual_based_algorithm/vbpr/__init__.py:
--------------------------------------------------------------------------------
1 | from .vbpr_algorithm import VBPR
2 | 
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/embedding_technique/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/test_int.json:
--------------------------------------------------------------------------------
1 | [{"Int_field": 50}]
--------------------------------------------------------------------------------
/clayrs/recsys/graph_based_algorithm/page_rank/__init__.py:
--------------------------------------------------------------------------------
1 | from .nx_page_rank import NXPageRank
2 | 
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/index_query/__init__.py:
--------------------------------------------------------------------------------
1 | from .index_query import IndexQuery
2 | 
--------------------------------------------------------------------------------
/clayrs/__init__.py:
--------------------------------------------------------------------------------
1 | import nest_asyncio
2 | nest_asyncio.apply()  # fix IPython multiprocessing error
3 | 
--------------------------------------------------------------------------------
/clayrs/content_analyzer/information_processor/postprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | from .postprocessor import *
2 | 
--------------------------------------------------------------------------------
/clayrs/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .load_content import load_content_instance
2 | from .report import Report
3 | 
--------------------------------------------------------------------------------
/docs/mkdocs/requirements-doc.txt:
--------------------------------------------------------------------------------
1 | mkdocs-material~=9.0.15
2 | mkdocstrings-python~=0.8.3
3 | griffe~=0.25.5
--------------------------------------------------------------------------------
/clayrs/content_analyzer/memory_interfaces/__init__.py:
--------------------------------------------------------------------------------
1 | from .text_interface import KeywordIndex, SearchIndex
2 | 
--------------------------------------------------------------------------------
/clayrs/recsys/graph_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | from . import page_rank
2 | 
3 | from .page_rank import *
4 | 
5 | 
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/eval_model.md:
--------------------------------------------------------------------------------
1 | # Eval Model class
2 | 
3 | ::: clayrs.evaluation.eval_model
4 |     handler: python
5 | 
--------------------------------------------------------------------------------
/docs/mkdocs/docs/img/colab_examples_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/docs/mkdocs/docs/img/colab_examples_1.png
--------------------------------------------------------------------------------
/docs/mkdocs/docs/img/colab_examples_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/docs/mkdocs/docs/img/colab_examples_2.png
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 |   status:
3 |     project:
4 |       default:
5 |         target: auto
6 |         threshold: 3%
7 |     patch: off
8 | 
--------------------------------------------------------------------------------
/test/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../clayrs/')))
4 | 
--------------------------------------------------------------------------------
/datasets/ml-100k_extra_small/users_extra_small.csv:
--------------------------------------------------------------------------------
1 | user_id,age,gender,occupation,zip_code
2 | 1,24,M,technician,85711
3 | 20,42,F,homemaker,95660
4 | 
--------------------------------------------------------------------------------
/test/content_analyzer/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../clayrs/')))
--------------------------------------------------------------------------------
/test/test_files/complex_contents/index/_MAIN_2.toc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/index/_MAIN_2.toc
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/1.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/1.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/10.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/10.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/11.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/11.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/12.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/12.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/13.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/13.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/14.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/14.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/15.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/15.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/16.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/16.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/17.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/17.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/18.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/18.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/19.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/19.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/2.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/2.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/20.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/20.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/21.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/21.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/22.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/22.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/23.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/23.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/24.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/24.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/25.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/25.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/26.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/26.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/27.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/27.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/28.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/28.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/29.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/29.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/3.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/3.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/30.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/30.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/31.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/31.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/32.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/32.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/33.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/33.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/34.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/34.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/35.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/35.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/36.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/36.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/37.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/37.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/38.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/38.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/39.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/39.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/4.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/4.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/40.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/40.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/41.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/41.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/42.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/42.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/43.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/43.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/44.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/44.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/45.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/45.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/46.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/46.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/47.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/47.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/48.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/48.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/49.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/49.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/5.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/5.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/50.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/50.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/51.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/51.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/52.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/52.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/53.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/53.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/54.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/54.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/55.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/55.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/56.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/56.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/57.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/57.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/58.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/58.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/59.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/59.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/6.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/6.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/60.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/60.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/61.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/61.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/62.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/62.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/63.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/63.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/64.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/64.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/65.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/65.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/66.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/66.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/67.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/67.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/68.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/68.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/69.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/69.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/7.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/7.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/70.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/70.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/8.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/8.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/9.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/9.xz
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/ri_model.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/ri_model.model
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/centroid_vector/__init__.py:
--------------------------------------------------------------------------------
1 | from .centroid_vector import CentroidVector
2 | from .similarities import CosineSimilarity
3 | 
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/doc2vec_model.kv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/doc2vec_model.kv
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/fasttext_model.kv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/fasttext_model.kv
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/word2vec_model.kv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/word2vec_model.kv
--------------------------------------------------------------------------------
/clayrs/content_analyzer/information_processor/visual_preprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | from .torch_builtin_transformer import *
2 | from .torch_builtin_augmenter import *
3 | 
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/__init__.py:
--------------------------------------------------------------------------------
1 | from .graph import UserNode, ItemNode, PropertyNode
2 | 
3 | from .nx_implementation import *
4 | from .feature_selection import *
5 | 
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/index_interface.md:
--------------------------------------------------------------------------------
1 | # Index interface
2 | 
3 | ::: clayrs.content_analyzer.memory_interfaces.text_interface
4 |     handler: python
5 | 
--------------------------------------------------------------------------------
/test/test_files/random_tsv.tsv:
--------------------------------------------------------------------------------
1 | listen improve differ
2 | visitor meant kind
3 | basis climb honor
4 | simple vote closer
5 | blind finger pencil
6 | clock energy shape
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/lsa/lsa_model.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/lsa/lsa_model.model
--------------------------------------------------------------------------------
/test/content_analyzer/content_representation/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../clayrs/')))
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112281.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112281.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112302.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112302.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112346.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112346.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112453.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112453.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112641.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112641.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112760.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112760.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112896.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112896.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113041.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113041.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113101.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113101.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113189.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113189.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113228.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113228.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113277.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113277.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113497.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113497.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113845.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113845.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113987.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113987.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114319.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114319.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114388.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114388.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114576.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114576.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114709.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114709.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114885.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114885.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/index/MAIN_hlk83r2qer820iyx.seg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/index/MAIN_hlk83r2qer820iyx.seg
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py linguist-detectable=true
2 | *.xml linguist-detectable=false
3 | *.html linguist-detectable=false
4 | *.js linguist-detectable=false
5 | *.css linguist-detectable=false
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/lsa/lsa_model.model.projection:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/lsa/lsa_model.model.projection
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | from . import embedding_learner
2 | from . import embedding_loader
3 | 
4 | from .embedding_learner import *
5 | from .embedding_loader import *
6 | 
--------------------------------------------------------------------------------
/docs/mkdocs/docs/utils/report.md:
--------------------------------------------------------------------------------
1 | # Report class
2 | 
3 | ::: clayrs.utils.Report
4 |     handler: python
5 |     options:
6 |       show_root_toc_entry: true
7 |       show_root_heading: true
8 | 
--------------------------------------------------------------------------------
/clayrs/evaluation/exceptions.py:
--------------------------------------------------------------------------------
1 | class NotEnoughUsers(Exception):
2 |     """
3 |     Exception to raise when DeltaGap tries to split n_users in n_groups but n_users < n_groups
4 |     """
5 |     pass
6 | 
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/feature_selection/exceptions.py:
--------------------------------------------------------------------------------
1 | class FeatureSelectionException(Exception):
2 |     """
3 |     Generic exception used inside the FeatureSelectionAlgorithm
4 |     """
5 |     pass
6 | 
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/nx_implementation/__init__.py:
--------------------------------------------------------------------------------
1 | from .nx_bipartite_graphs import NXBipartiteGraph
2 | from .nx_tripartite_graphs import NXTripartiteGraph
3 | from .nx_full_graphs import NXFullGraph
4 | 
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/o-neill-dress-black-and-white-164-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/o-neill-dress-black-and-white-164-1.jpg
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/wildfox-floral-print-leggings-357-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/wildfox-floral-print-leggings-357-1.jpg
--------------------------------------------------------------------------------
/clayrs/content_analyzer/ratings_manager/__init__.py:
--------------------------------------------------------------------------------
1 | from .score_processor import NumberNormalizer
2 | from .ratings import Ratings, Rank, Prediction
3 | from .sentiment_analysis import TextBlobSentimentAnalysis
4 | 
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/anthropologie-skirt-light-pink-434-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/anthropologie-skirt-light-pink-434-1.jpg
--------------------------------------------------------------------------------
/datasets/ml-100k_extra_small/ratings_extra_small.csv:
--------------------------------------------------------------------------------
1 | user_id,item_id,rating,timestamp
2 | 1,61,4,878542420
3 | 1,189,3,888732928
4 | 1,33,4,878542699
5 | 20,288,1,879667584
6 | 20,208,2,879669401
7 | 20,11,2,879669401
8 | 
--------------------------------------------------------------------------------
/test/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | dir_test_files = os.path.join(os.path.dirname(__file__), 'test_files') 5 | dir_root_repo = Path(os.path.join(os.path.dirname(__file__), '..')).resolve() 6 | -------------------------------------------------------------------------------- /test/test_files/test_images/images_files/haute-hippie-top-white-and-black-1015-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/haute-hippie-top-white-and-black-1015-1.jpg -------------------------------------------------------------------------------- /test/test_files/test_images/images_files/elizabeth-and-james-top-ecru-and-pink-81-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/elizabeth-and-james-top-ecru-and-pink-81-1.jpg -------------------------------------------------------------------------------- /clayrs/recsys/content_based_algorithm/classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from .classifier_recommender import ClassifierRecommender 2 | from .classifiers import SkSVC, SkKNN, SkRandomForest, SkLogisticRegression, SkDecisionTree, SkGaussianProcess 3 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/methodology/all_items.md: -------------------------------------------------------------------------------- 1 | # All Items methodology 2 | 3 | ::: clayrs.recsys.AllItemsMethodology 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/graph_based_recsys.md: -------------------------------------------------------------------------------- 1 | # Graph Based RecSys 2 | 3 | ::: clayrs.recsys.recsys.GraphBasedRS 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/methodology/test_items.md: -------------------------------------------------------------------------------- 1 | # Test Items methodology 2 | 3 | ::: clayrs.recsys.TestItemsMethodology 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/partitioning/kfold.md: -------------------------------------------------------------------------------- 1 | # KFold partitioning technique 2 | 3 | ::: clayrs.recsys.KFoldPartitioning 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/from_npy.md: -------------------------------------------------------------------------------- 1 | # Import from NPY 2 | 3 | ::: clayrs.content_analyzer.FromNPY 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- 
/docs/mkdocs/docs/recsys/methodology/test_ratings.md: -------------------------------------------------------------------------------- 1 | # Test Ratings methodology 2 | 3 | ::: clayrs.recsys.TestRatingsMethodology 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/partitioning/bootstrap.md: -------------------------------------------------------------------------------- 1 | # Bootstrap partitioning technique 2 | 3 | ::: clayrs.recsys.BootstrapPartitioning 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/partitioning/hold_out.md: -------------------------------------------------------------------------------- 1 | # HoldOut partitioning technique 2 | 3 | ::: clayrs.recsys.HoldOutPartitioning 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | pragma: no cover 4 | def __repr__ 5 | def __str__ 6 | def __init__ 7 | raise NotImplementedError 8 | pass 9 | if __name__ == .__main__.: 10 | from 11 | import 12 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/content_based/content_based_recsys.md: -------------------------------------------------------------------------------- 1 | # Content Based RecSys 2 | 3 | ::: clayrs.recsys.recsys.ContentBasedRS 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/methodology/training_items.md: -------------------------------------------------------------------------------- 1 | # Training Items methodology 2 | 3 | ::: clayrs.recsys.TrainingItemsMethodology 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_loader/__init__.py: -------------------------------------------------------------------------------- 1 | from .gensim import Gensim 2 | from .transformer import Transformers, BertTransformers, T5Transformers 3 | from .sbert import Sbert 4 | from .vector_strategy import SumStrategy, CatStrategy 5 | -------------------------------------------------------------------------------- /clayrs/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from . import eval_pipeline_modules 2 | from . 
import metrics 3 | 4 | from .eval_pipeline_modules import * 5 | from .metrics import * 6 | from .eval_model import EvalModel 7 | from .statistical_test import Ttest, Wilcoxon 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/methodology/abstract_methodology.md: -------------------------------------------------------------------------------- 1 | # Abstract methodology class 2 | 3 | ::: clayrs.recsys.methodology.Methodology 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /clayrs/recsys/graphs/feature_selection/__init__.py: -------------------------------------------------------------------------------- 1 | from .feature_selection_alg import TopKPageRank, TopKDegreeCentrality, TopKEigenVectorCentrality 2 | from .feature_selection_fn import feature_selector 3 | from .exceptions import FeatureSelectionException 4 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/partitioning/abstract_partitioning.md: -------------------------------------------------------------------------------- 1 | # Abstract Partitioning class 2 | 3 | ::: clayrs.recsys.partitioning.Partitioning 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/information_processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .nltk_processor import NLTK 2 | from .spacy_processor import Spacy 3 | from .ekphrasis_processor import Ekphrasis 4 | 5 | 6 | from .visual_preprocessors import * 7 | from .postprocessors import * 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/raw_sources.md: -------------------------------------------------------------------------------- 1 | # Raw Source Wrappers 2 | 3 | ::: clayrs.content_analyzer.raw_information_source 4 | handler: python 5 | options: 6 | filters: 7 | - "!^_[^_]" 8 | - "!^RawInformationSource$" 9 | -------------------------------------------------------------------------------- /clayrs/recsys/content_based_algorithm/regressor/__init__.py: -------------------------------------------------------------------------------- 1 | from .linear_predictor import LinearPredictor 2 | from .regressors import SkLinearRegression, SkRidge, SkBayesianRidge, SkSGDRegressor, SkARDRegression,\ 3 | SkHuberRegressor, SkPassiveAggressiveRegressor 4 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/information_preprocessors/textual_preprocessors/nltk.md: -------------------------------------------------------------------------------- 1 | # NLTK Preprocessor 2 | 3 | ::: clayrs.content_analyzer.NLTK 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/information_preprocessors/textual_preprocessors/spacy.md: -------------------------------------------------------------------------------- 1 | # Spacy preprocessor 2 | 3 | ::: clayrs.content_analyzer.Spacy 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 
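A quick construction example (a minimal sketch — parameter names such as `model`, `stopwords_removal` and `lemmatization` are assumptions here, to be checked against the signature documented above):

```python
from clayrs import content_analyzer as ca

# spaCy-based preprocessor which removes stopwords and lemmatizes
# tokens using the 'en_core_web_sm' pipeline
spacy_processor = ca.Spacy(model='en_core_web_sm',
                           stopwords_removal=True,
                           lemmatization=True)
```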
-------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/exogenous_techniques/babelfy.md: -------------------------------------------------------------------------------- 1 | # BabelPy Entity Linking 2 | 3 | ::: clayrs.content_analyzer.BabelPyEntityLinking 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/content_based/content_based_algorithms/index_query.md: -------------------------------------------------------------------------------- 1 | # Index Query 2 | 3 | ::: clayrs.recsys.content_based_algorithm.index_query.index_query.IndexQuery 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/exogenous_techniques/dbpedia.md: -------------------------------------------------------------------------------- 1 | # Properties from DBPedia ontology 2 | 3 | ::: clayrs.content_analyzer.DBPediaMappingTechnique 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/content_based/visual_based_algorithms/vbpr.md: -------------------------------------------------------------------------------- 1 | # VBPR 2 | 3 | ::: clayrs.recsys.visual_based_algorithm.vbpr.vbpr_algorithm.VBPR 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/information_preprocessors/textual_preprocessors/ekphrasis.md: -------------------------------------------------------------------------------- 1 | # Ekphrasis Preprocessor 2 | 3 | ::: clayrs.content_analyzer.Ekphrasis 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/graph_based_algorithms/nx_pagerank.md: -------------------------------------------------------------------------------- 1 | # Page Rank 2 | 3 | 4 | ::: clayrs.recsys.graph_based_algorithm.page_rank.nx_page_rank.NXPageRank 5 | handler: python 6 | options: 7 | show_root_toc_entry: true 8 | show_root_heading: true -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/graphs/nx_bipartite.md: -------------------------------------------------------------------------------- 1 | # Bipartite Graph 2 | 3 | ::: clayrs.recsys.graphs.nx_implementation.nx_bipartite_graphs.NXBipartiteGraph 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/exogenous_techniques/properties_from_dataset.md: -------------------------------------------------------------------------------- 1 | # Properties from local dataset 2 | 3 | ::: clayrs.content_analyzer.PropertiesFromDataset 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /test/test_files/d2v_test_data.json: 
-------------------------------------------------------------------------------- 1 | [{"id_field":"01","doc_field":"I love machine learning. Its awesome."}, 2 | {"id_field":"02","doc_field":"I love coding in python"}, 3 | {"id_field":"03","doc_field":"I love building chatbots"}, 4 | {"id_field":"04","doc_field":"they chat amagingly well"}] -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/original_data.md: -------------------------------------------------------------------------------- 1 | # Original Data 2 | 3 | ::: clayrs.content_analyzer.OriginalData 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | members: none 9 | -------------------------------------------------------------------------------- /clayrs/recsys/content_based_algorithm/__init__.py: -------------------------------------------------------------------------------- 1 | from . import centroid_vector 2 | from . import classifier 3 | from . import index_query 4 | from . import regressor 5 | 6 | from .centroid_vector import * 7 | from .classifier import * 8 | from .index_query import * 9 | from .regressor import * 10 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/synset_df_frequency.md: -------------------------------------------------------------------------------- 1 | # Synset Document Frequency 2 | 3 | ::: clayrs.content_analyzer.PyWSDSynsetDocumentFrequency 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/evaluation/metrics/plot_metrics.md: -------------------------------------------------------------------------------- 1 | # Plot metrics 2 | 3 | Plot metrics save a plot in the chosen output directory 4 | 5 | ::: clayrs.evaluation.metrics.plot_metrics 6 | handler: python 7 | options: 8 | filters: 9 | - "!^_[^_]" 10 | - "!^PlotMetric$" 11 | - "!.*def.*" -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/__init__.py: -------------------------------------------------------------------------------- 1 | from .doc2vec import GensimDoc2Vec 2 | from .fasttext import GensimFastText 3 | from .latent_semantic_analysis import GensimLatentSemanticAnalysis 4 | from .random_indexing import GensimRandomIndexing 5 | from .word2vec import GensimWord2Vec 6 | from .lda import GensimLDA 7 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/field_content_production_techniques/visual_techniques/__init__.py: -------------------------------------------------------------------------------- 1 | from .low_level_techniques import SkImageHogDescriptor, SkImageCannyEdgeDetector, SkImageSIFT, ColorsHist, \ 2 | SkImageLBP, ColorQuantization, CustomFilterConvolution 3 | from .high_level_techniques import PytorchImageModels, CaffeImageModels 4 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/evaluation/metrics/error_metrics.md: -------------------------------------------------------------------------------- 1 | # Error metrics 2 | 3 | Error metrics evaluate 'how wrong' the recommender system was in predicting a rating 4 | 5 | ::: clayrs.evaluation.metrics.error_metrics 6 | 
handler: python 7 | options: 8 | filters: 9 | - "!^_[^_]" 10 | - "!^ErrorMetric$" -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/visual_techniques/high_level_visual.md: -------------------------------------------------------------------------------- 1 | # High level techniques 2 | 3 | ::: clayrs.content_analyzer.field_content_production_techniques.visual_techniques.high_level_techniques 4 | handler: python 5 | options: 6 | heading_level: 3 7 | filters: 8 | - '!^HighLevelVisual$' 9 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/visual_techniques/low_level_visual.md: -------------------------------------------------------------------------------- 1 | # Low level techniques 2 | 3 | ::: clayrs.content_analyzer.field_content_production_techniques.visual_techniques.low_level_techniques 4 | handler: python 5 | options: 6 | heading_level: 3 7 | filters: 8 | - '!^LowLevelVisual$' 9 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/evaluation/metrics/ranking_metrics.md: -------------------------------------------------------------------------------- 1 | # Ranking metrics 2 | 3 | Ranking metrics evaluate the quality of the recommendation lists 4 | 5 | ::: clayrs.evaluation.metrics.ranking_metrics 6 | handler: python 7 | options: 8 | filters: 9 | - "!^_[^_]" 10 | - "!^RankingMetric$" 11 | - "!.*def.*" -------------------------------------------------------------------------------- /clayrs/content_analyzer/field_content_production_techniques/embedding_technique/__init__.py: -------------------------------------------------------------------------------- 1 | from .combining_technique import Centroid, Sum, SingleToken 2 | from .embedding_technique import WordEmbeddingTechnique, SentenceEmbeddingTechnique, DocumentEmbeddingTechnique, \ 3 | Word2SentenceEmbedding, Sentence2DocEmbedding, Word2DocEmbedding, Sentence2WordEmbedding 4 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/ratings/ratings.md: -------------------------------------------------------------------------------- 1 | # Ratings class 2 | 3 | The `Ratings` class is mainly responsible for importing a dataset containing interactions between users and items 4 | 5 | ::: clayrs.content_analyzer.ratings_manager.Ratings 6 | handler: python 7 | options: 8 | show_root_toc_entry: true 9 | show_root_heading: true 10 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/field_content_production_techniques/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import embedding_technique 2 | 3 | from .embedding_technique import * 4 | from .visual_techniques import * 5 | from .tf_idf import WhooshTfIdf, SkLearnTfIdf 6 | from .field_content_production_technique import OriginalData, FromNPY 7 | from .synset_document_frequency import PyWSDSynsetDocumentFrequency 8 | -------------------------------------------------------------------------------- /clayrs/utils/const.py: -------------------------------------------------------------------------------- 1 | import os 2 | from clayrs.utils.custom_logger import get_custom_logger 3 | 4 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | root_path = os.path.join(THIS_DIR, '../../') 6 | contents_path = os.path.join(root_path, 'contents/') 7 | datasets_path = os.path.join(root_path, 'datasets/') 8 | 9 | logger = get_custom_logger('custom_logger') 10 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/config.md: -------------------------------------------------------------------------------- 1 | # Content Analyzer Config 2 | 3 | ::: clayrs.content_analyzer.config 4 | handler: python 5 | 6 | 7 | ## Content Analyzer Class 8 | 9 | ::: clayrs.content_analyzer.ContentAnalyzer 10 | handler: python 11 | options: 12 | heading_level: 3 13 | show_root_toc_entry: true 14 | show_root_heading: true 15 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/experiment.md: -------------------------------------------------------------------------------- 1 | # Experiment class 2 | 3 | ::: clayrs.recsys.ContentBasedExperiment 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | ::: clayrs.recsys.GraphBasedExperiment 10 | handler: python 11 | options: 12 | show_root_toc_entry: true 13 | show_root_heading: true 14 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/evaluation/metrics/fairness_metrics.md: -------------------------------------------------------------------------------- 1 | # Fairness metrics 2 | 3 | Fairness metrics evaluate how unbiased the recommendation lists are (e.g. 
not biased towards the most popular items) 4 | 5 | ::: clayrs.evaluation.metrics.fairness_metrics 6 | handler: python 7 | options: 8 | filters: 9 | - "!^_[^_]" 10 | - "!^FairnessMetric$" 11 | - "!.*def.*" -------------------------------------------------------------------------------- /docs/mkdocs/docs/javascripts/mathjax.js: -------------------------------------------------------------------------------- 1 | window.MathJax = { 2 | tex: { 3 | inlineMath: [["\\(", "\\)"]], 4 | displayMath: [["\\[", "\\]"]], 5 | processEscapes: true, 6 | processEnvironments: true 7 | }, 8 | options: { 9 | ignoreHtmlClass: ".*|", 10 | processHtmlClass: "arithmatex" 11 | } 12 | }; 13 | 14 | document$.subscribe(() => { 15 | MathJax.typesetPromise() 16 | }) -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/tfidf.md: -------------------------------------------------------------------------------- 1 | # TfIdf 2 | 3 | ::: clayrs.content_analyzer.SkLearnTfIdf 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | ::: clayrs.content_analyzer.WhooshTfIdf 10 | handler: python 11 | options: 12 | show_root_toc_entry: true 13 | show_root_heading: true -------------------------------------------------------------------------------- /test/test_files/users_info.json: -------------------------------------------------------------------------------- 1 | [{"user_id": "1", "name": "Roberto", "birth_date": "10-09-1998"}, 2 | {"user_id": "2", "name": "Mattia", "birth_date": "11-10-1996"}, 3 | {"user_id": "3", "name": "Francesco", "birth_date": "01-03-1995"}, 4 | {"user_id": "4", "name": "Carlo", "birth_date": "07-09-1992"}, 5 | {"user_id": "5", "name": "Pasquale", "birth_date": "13-11-1998"}, 6 | {"user_id": "6", "name": "Sergio", "birth_date": "06-05-1998"}] -------------------------------------------------------------------------------- /test/test_files/test_dbpedia/movies_info_reduced.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"Title": "Jumanji", "Year": "1995", "Rated": "PG", "Released": "15 Dec 1995", "Budget_source": "6.5E7", "cinematography": "", "only_local": "", "wiki_id": "3700174", "runtime (m)": "104.0"}, 3 | {"Title": "Inception", "Budget_source": "1.6E8", "only_local": ""}, 4 | {"Title": "Demon Island"}, 5 | {"Title": "Léon: The Professional"}, 6 | {"Title": "not_exiiiiissstss"} 7 | ] -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/graphs/nx_full.md: -------------------------------------------------------------------------------- 1 | # Full Graph 2 | 3 | Please remember that this class is a subclass of [NXTripartiteGraph][clayrs.recsys.NXTripartiteGraph], 4 | so it inherits all its methods. You can check their documentation as well! 
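As a quick orientation, a full graph could be built directly from imported interactions (a minimal sketch — `ratings.csv` is a hypothetical file, and the constructor arguments shown should be checked against the documentation below):

```python
from clayrs import content_analyzer as ca
from clayrs import recsys as rs

# import user-item interactions from a hypothetical CSV file
ratings = ca.Ratings(ca.CSVFile('ratings.csv'))

# build a full graph: property nodes can be linked
# to user nodes as well as to item nodes
full_graph = rs.NXFullGraph(ratings)
```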
5 | 6 | ::: clayrs.recsys.graphs.nx_implementation.nx_full_graphs.NXFullGraph 7 | handler: python 8 | options: 9 | show_root_toc_entry: true 10 | show_root_heading: true -------------------------------------------------------------------------------- /test/test_files/test_ratings/ratings_1591277020.csv: -------------------------------------------------------------------------------- 1 | 01,a,0.2333333333333333,1234567,not so good,I expected more from this product,2.0 2 | 01,b,0.8333333333333334,1234567,perfect,I love this product,5.0 3 | 01,c,0.8666666666666667,1234567,awesome,The perfect gift for my darling,4.0 4 | 02,a,-0.3666666666666667,1234567,a disaster,Too much expensive ,1.0 5 | 02,c,0.6,1234567,really good,A good compromise,3.5 6 | 03,b,0.6666666666666666,1234567,Awesome,,5.0 7 | -------------------------------------------------------------------------------- /clayrs/evaluation/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .classification_metrics import Precision, PrecisionAtK, RPrecision, Recall, RecallAtK, \ 2 | FMeasure, FMeasureAtK 3 | from .error_metrics import MAE, MSE, RMSE 4 | from .fairness_metrics import GiniIndex, DeltaGap, PredictionCoverage, CatalogCoverage 5 | from .plot_metrics import PopRatioProfileVsRecs, PopRecsCorrelation, LongTailDistr 6 | from .ranking_metrics import NDCG, NDCGAtK, MRR, MRRAtK, Correlation, MAP 7 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/ratings/score_processors.md: -------------------------------------------------------------------------------- 1 | # Score Processors 2 | 3 | ::: clayrs.content_analyzer.ratings_manager.NumberNormalizer 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | ::: clayrs.content_analyzer.ratings_manager.TextBlobSentimentAnalysis 10 | handler: python 11 | options: 12 | show_root_toc_entry: true 13 | show_root_heading: true -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/graphs/nx_tripartite.md: -------------------------------------------------------------------------------- 1 | # Tripartite Graph 2 | 3 | Please remember that this class is a subclass of [NXBipartiteGraph][clayrs.recsys.NXBipartiteGraph], 4 | so it inherits all its methods. You can check their documentation as well! 
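As a quick orientation, a tripartite graph could be built directly from imported interactions (a minimal sketch — `ratings.csv` is a hypothetical file, and the constructor arguments shown should be checked against the documentation below):

```python
from clayrs import content_analyzer as ca
from clayrs import recsys as rs

# import user-item interactions from a hypothetical CSV file
ratings = ca.Ratings(ca.CSVFile('ratings.csv'))

# build a tripartite graph: property nodes, if added,
# can only be linked to item nodes
tripartite_graph = rs.NXTripartiteGraph(ratings)
```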
5 | 6 | ::: clayrs.recsys.graphs.nx_implementation.nx_tripartite_graphs.NXTripartiteGraph 7 | handler: python 8 | options: 9 | show_root_toc_entry: true 10 | show_root_heading: true 11 | -------------------------------------------------------------------------------- /clayrs/recsys/graphs/graph_metrics.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class GraphMetrics(ABC): 5 | """ 6 | Interface for graph metrics 7 | """ 8 | 9 | @abstractmethod 10 | def degree_centrality(self): 11 | raise NotImplementedError 12 | 13 | @abstractmethod 14 | def closeness_centrality(self): 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def dispersion(self): 19 | raise NotImplementedError 20 | -------------------------------------------------------------------------------- /test/utils/test_load_content.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | from clayrs.utils.load_content import load_content_instance 4 | from test import dir_test_files 5 | 6 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/') 7 | 8 | 9 | class Test(TestCase): 10 | def test_load_content_instance(self): 11 | self.assertIsNone(load_content_instance("not_existent", "invalid_item")) 12 | self.assertIsNotNone(load_content_instance(movies_dir, "tt0112281")) 13 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/evaluation/metrics/classification_metrics.md: -------------------------------------------------------------------------------- 1 | # Classification metrics 2 | 3 | A classification metric uses confusion matrix terminology (true positive, false positive, true negative, false negative) 4 | to classify each predicted item, and in general it needs a way to discern relevant items from non-relevant ones for 5 | each user 6 | 7 | ::: clayrs.evaluation.metrics.classification_metrics 8 | handler: python 9 | options: 10 | filters: 11 | - "!^_[^_]" 12 | - "!^ClassificationMetric$" -------------------------------------------------------------------------------- /docs/mkdocs/docs/evaluation/statistical_tests/paired.md: -------------------------------------------------------------------------------- 1 | # Paired statistical tests 2 | 3 | ::: clayrs.evaluation.statistical_test.PairedTest 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | ::: clayrs.evaluation.statistical_test.Ttest 10 | handler: python 11 | options: 12 | show_root_toc_entry: true 13 | show_root_heading: true 14 | 15 | ::: clayrs.evaluation.statistical_test.Wilcoxon 16 | handler: python 17 | options: 18 | show_root_toc_entry: true 19 | show_root_heading: true -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/information_preprocessors/visual_preprocessors/torch_preprocessors.md: -------------------------------------------------------------------------------- 1 | # Torch Preprocessors 2 | 3 | ## Torch transformers 4 | 5 | ::: clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_transformer 6 | handler: python 7 | options: 8 | heading_level: 3 9 | filters: 10 | - '!^TorchBuiltInTransformer$' 11 | 12 | --- 13 | 14 | ## Torch augmenters 15 | 16 | ::: clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_augmenter 17 | handler: python 18 | options: 19 | 
heading_level: 3 20 | -------------------------------------------------------------------------------- /clayrs/recsys/__init__.py: -------------------------------------------------------------------------------- 1 | from . import content_based_algorithm 2 | from . import graph_based_algorithm 3 | from . import graphs 4 | 5 | from .content_based_algorithm import * 6 | from .graph_based_algorithm import * 7 | from .graphs import * 8 | from .visual_based_algorithm import * 9 | from .recsys import ContentBasedRS, GraphBasedRS 10 | from .partitioning import KFoldPartitioning, HoldOutPartitioning, BootstrapPartitioning 11 | from .methodology import TestRatingsMethodology, TestItemsMethodology, TrainingItemsMethodology, AllItemsMethodology 12 | from .experiment import ContentBasedExperiment, GraphBasedExperiment 13 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/content_based/content_based_algorithms/linear_predictor.md: -------------------------------------------------------------------------------- 1 | # Linear Predictor 2 | 3 | ::: clayrs.recsys.content_based_algorithm.regressor.linear_predictor.LinearPredictor 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | --- 10 | 11 | ## Regressors Implemented 12 | 13 | The following are the regressors you can use in the `regressor` parameter of the `LinearPredictor` class 14 | 15 | 16 | ::: clayrs.recsys.content_based_algorithm.regressor.regressors 17 | handler: python 18 | options: 19 | heading_level: 3 20 | filters: 21 | - '!^Regressor$' -------------------------------------------------------------------------------- /test/test_files/test_images/tradesy_small_local_relative_paths.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"imagePath": "test/test_files/test_images/images_files/haute-hippie-top-white-and-black-1015-1.jpg", "itemID": "1015"}, 3 | {"imagePath": "test/test_files/test_images/images_files/anthropologie-skirt-light-pink-434-1.jpg", "itemID": "434"}, 4 | {"imagePath": "test/test_files/test_images/images_files/wildfox-floral-print-leggings-357-1.jpg", "itemID": "357"}, 5 | {"imagePath": "test/test_files/test_images/images_files/o-neill-dress-black-and-white-164-1.jpg", "itemID": "164"}, 6 | {"imagePath": "test/test_files/test_images/images_files/elizabeth-and-james-top-ecru-and-pink-81-1.jpg", "itemID": "81"} 7 | ] -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/content_based/content_based_algorithms/centroid_vector.md: -------------------------------------------------------------------------------- 1 | # Centroid Vector 2 | 3 | ::: clayrs.recsys.content_based_algorithm.centroid_vector.CentroidVector 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | --- 10 | 11 | ## Similarities implemented 12 | 13 | The following are similarities you can use in the `similarity` parameter of the `CentroidVector` class 14 | 15 | ::: clayrs.recsys.content_based_algorithm.centroid_vector.similarities.CosineSimilarity 16 | handler: python 17 | options: 18 | heading_level: 3 19 | show_root_toc_entry: true 20 | show_root_heading: true 21 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/content_based/content_based_algorithms/classifier_recommender.md: -------------------------------------------------------------------------------- 1 | # 
Classifier Recommender 2 | 3 | ::: clayrs.recsys.content_based_algorithm.classifier.classifier_recommender.ClassifierRecommender 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | --- 10 | 11 | ## Classifiers Implemented 12 | 13 | The following are the classifiers you can use in the `classifier` parameter of the `ClassifierRecommender` class 14 | 15 | 16 | ::: clayrs.recsys.content_based_algorithm.classifier.classifiers 17 | handler: python 18 | options: 19 | heading_level: 3 20 | filters: 21 | - '!^Classifier$' -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /test/content_analyzer/utils/test_check_tokenization.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from clayrs.content_analyzer.utils.check_tokenization import check_not_tokenized, check_tokenized, tokenize_in_sentences 3 | 4 | 5 | class Test(TestCase): 6 | def test_check_tokenized(self): 7 | str_ = 'abcd efg' 8 | list_ = ['abcd', 'efg'] 9 | check_tokenized(str_) 10 | check_tokenized(list_) 11 | check_not_tokenized(str_) 12 | check_not_tokenized(list_) 13 | 14 | def test_tokenize_sentence(self): 15 | 16 | phrases = "Ciao, questa è una prova. Anche questa. 
And this is the third" 17 | result = tokenize_in_sentences(phrases) 18 | 19 | self.assertTrue(len(result) == 3) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas~=1.5.3 2 | numpy~=1.22.4 3 | gensim~=4.3.0 4 | nltk~=3.5 5 | babelpy~=1.0.1 6 | mysql~=0.0.3 7 | mysql-connector-python~=8.0.20 8 | scikit-learn==1.0.2 9 | SPARQLWrapper~=1.8.5 10 | textblob~=0.15.3 11 | matplotlib~=3.5.0 12 | pywsd~=1.2.4 13 | wn~=0.0.23 14 | networkx~=2.6.3 15 | whoosh~=2.7.4 16 | sentence-transformers~=1.2.0 17 | colorama~=0.4.4 18 | tqdm~=4.62.2 19 | spacy~=3.2.1 20 | ekphrasis~=0.5.4 21 | scipy~=1.7.3 22 | torch~=1.13.0 23 | transformers~=4.29.2 24 | pyaml~=21.10.1 25 | PyYAML~=6.0.1 26 | distex~=0.7.1 27 | nest-asyncio~=1.5.5 28 | validators~=0.20.0 29 | requests~=2.28.2 30 | timm~=0.6.12 31 | scikit-image~=0.19.3 32 | torchvision~=0.14.1 33 | numpy-indexed~=0.3.5 34 | Pillow~=9.4.0 35 | opencv-python~=4.7.0.72 36 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 
33 | -------------------------------------------------------------------------------- /test/test_files/test_import_ratings.json: -------------------------------------------------------------------------------- 1 | [{"user_id": "01", "item_id": "a", "review_title": "not so good","text": "I expected more from this product", "stars": 2.0, "timestamp": 1234567}, 2 | {"user_id": "01", "item_id": "b", "review_title": "perfect","text": "I love this product", "stars": 5.0, "timestamp": 1234567}, 3 | {"user_id": "02", "item_id": "a", "review_title": "a disaster","text": "Too much expensive ", "stars": 1.0, "timestamp": 1234567}, 4 | {"user_id": "02", "item_id": "c", "review_title": "really good","text": "A good compromise", "stars": 3.5, "timestamp": 1234567}, 5 | {"user_id": "03", "item_id": "b", "review_title": "Awesome","text": "", "stars": 5.0, "timestamp": 1234567}, 6 | {"user_id": "01", "item_id": "c", "review_title": "awesome","text": "The perfect gift for my darling", "stars": 4.0, "timestamp": 1234567} 7 | ] 8 | -------------------------------------------------------------------------------- /test/content_analyzer/utils/test_id_merger.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from clayrs.content_analyzer.utils.id_merger import id_values_merger 4 | 5 | 6 | class Test(TestCase): 7 | def test_id_merger(self): 8 | self.assertEqual(id_values_merger('aaa'), 'aaa', "Must return a string value") 9 | self.assertEqual(id_values_merger(['aaa', 'bbb']), 'aaa_bbb', "Must return a string value like this aaa_bbb") 10 | self.assertEqual(id_values_merger(123), '123', "Must return a string value") 11 | self.assertEqual(id_values_merger([123, 124]), '123_124', "Must return a string value like this 123_124") 12 | self.assertEqual(id_values_merger([123, "aaa"]), '123_aaa', "Must return a string value like 123_aaa") 13 | with self.assertRaises(TypeError): 14 | id_values_merger({1: 1, 2: 2}) 15 | -------------------------------------------------------------------------------- /.github/workflows/docs_building.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | on: 3 | push: 4 | branches: 5 | - master 6 | workflow_dispatch: 7 | 8 | jobs: 9 | main: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - 14 | name: Checkout Repository 15 | uses: actions/checkout@v2 16 | with: 17 | fetch-depth: 0 18 | - 19 | name: Setup python 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.8 #install the python needed 23 | - 24 | name: Setup documentation requirements 25 | run: | 26 | pip install --upgrade pip 27 | pip install -r docs/mkdocs/requirements-doc.txt 28 | - 29 | name: Deploy documentation 30 | run: | 31 | cd docs/mkdocs 32 | mkdocs gh-deploy --clean 33 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | from . import content_representation 2 | from . import embeddings 3 | from . import field_content_production_techniques 4 | from . import information_processor 5 | from . import memory_interfaces 6 | from . 
import ratings_manager 7 | 8 | from .content_representation import * 9 | from .embeddings import * 10 | from .field_content_production_techniques import * 11 | from .information_processor import * 12 | from .memory_interfaces import * 13 | from .ratings_manager import * 14 | from .config import ExogenousConfig, UserAnalyzerConfig, ItemAnalyzerConfig, FieldConfig 15 | from .content_analyzer_main import ContentAnalyzer 16 | from .exogenous_properties_retrieval import DBPediaMappingTechnique, PropertiesFromDataset, BabelPyEntityLinking 17 | from .raw_information_source import CSVFile, JSONFile, DATFile, SQLDatabase 18 | 19 | 20 | -------------------------------------------------------------------------------- /test/content_analyzer/field_content_production_techniques/test_synset_document_frequency.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | 4 | from clayrs.content_analyzer.content_representation.content import FeaturesBagField 5 | from clayrs.content_analyzer.raw_information_source import JSONFile 6 | from clayrs.content_analyzer.field_content_production_techniques import PyWSDSynsetDocumentFrequency 7 | from test import dir_test_files 8 | 9 | file_path = os.path.join(dir_test_files, "movies_info_reduced.json") 10 | 11 | 12 | class TestSynsetDocumentFrequency(TestCase): 13 | def test_produce_content(self): 14 | technique = PyWSDSynsetDocumentFrequency() 15 | 16 | features_bag_list = technique.produce_content("Plot", [], [], JSONFile(file_path)) 17 | 18 | self.assertEqual(len(features_bag_list), 20) 19 | self.assertIsInstance(features_bag_list[0], FeaturesBagField) 20 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/graphs/nodes.md: -------------------------------------------------------------------------------- 1 | # Nodes categories 2 | 3 | The following are all the categories of nodes that can be added to a graph. 4 | 5 | !!! info 6 | 7 | Please note that there exist [Bipartite Graph][clayrs.recsys.NXBipartiteGraph], 8 | [Tripartite Graph][clayrs.recsys.NXTripartiteGraph] and 9 | [Full Graph][clayrs.recsys.NXFullGraph], all with their peculiarities and restrictions. 10 | 11 | Check their documentation for more! 
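For instance, each node category simply wraps the raw id of the entity it represents (a minimal sketch — the constructor taking the raw id as its only argument is an assumption, to be verified against the classes documented below; the ids themselves are illustrative):

```python
from clayrs import recsys as rs

# wrap raw ids into typed graph nodes
user_node = rs.UserNode('u1')
item_node = rs.ItemNode('tt0112281')
property_node = rs.PropertyNode('Steven Spielberg')
```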
12 | 13 | ::: clayrs.recsys.graphs.UserNode 14 | handler: python 15 | options: 16 | show_root_toc_entry: true 17 | show_root_heading: true 18 | 19 | ::: clayrs.recsys.graphs.ItemNode 20 | handler: python 21 | options: 22 | show_root_toc_entry: true 23 | show_root_heading: true 24 | 25 | ::: clayrs.recsys.graphs.PropertyNode 26 | handler: python 27 | options: 28 | show_root_toc_entry: true 29 | show_root_heading: true -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_learner/test_doc2vec.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | import pathlib as pl 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner.doc2vec import GensimDoc2Vec 6 | from clayrs.content_analyzer.information_processor.nltk_processor import NLTK 7 | from clayrs.content_analyzer.raw_information_source import JSONFile 8 | from test import dir_test_files 9 | 10 | file_path = os.path.join(dir_test_files, 'movies_info_reduced.json') 11 | 12 | 13 | class TestGensimDoc2Vec(TestCase): 14 | def test_fit(self): 15 | model_path = "./model_test_Doc2Vec" 16 | learner = GensimDoc2Vec(model_path, True) 17 | learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()]) 18 | model_path += ".kv" 19 | 20 | self.assertEqual(learner.get_embedding("ace").any(), True) 21 | self.assertEqual(pl.Path(model_path).resolve().is_file(), True) 22 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_learner/test_word2vec.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | import pathlib as pl 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner import GensimWord2Vec 6 | from clayrs.content_analyzer.information_processor.nltk_processor import NLTK 7 | from clayrs.content_analyzer.raw_information_source import JSONFile 8 | from test import dir_test_files 9 | 10 | file_path = os.path.join(dir_test_files, 'movies_info_reduced.json') 11 | 12 | 13 | class TestGensimWord2Vec(TestCase): 14 | def test_fit(self): 15 | model_path = "./model_test_Word2Vec" 16 | learner = GensimWord2Vec(model_path, True) 17 | learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()]) 18 | model_path += ".kv" 19 | 20 | self.assertEqual(learner.get_embedding("ace").any(), True) 21 | self.assertEqual(pl.Path(model_path).resolve().is_file(), True) 22 | 23 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_learner/test_fasttext.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | import pathlib as pl 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner import GensimFastText 6 | from clayrs.content_analyzer.information_processor.nltk_processor import NLTK 7 | from clayrs.content_analyzer.raw_information_source import JSONFile 8 | from test import dir_test_files 9 | 10 | file_path = os.path.join(dir_test_files, 'movies_info_reduced.json') 11 | 12 | 13 | class TestGensimFastText(TestCase): 14 | def test_fit(self): 15 | model_path = "./model_test_FastText" 16 | learner = GensimFastText(model_path, True) 17 | learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()]) 
18 | model_path += ".kv" 19 | 20 | self.assertEqual(learner.get_embedding("ace").any(), True) 21 | self.assertEqual(pl.Path(model_path).resolve().is_file(), True) 22 | 23 | 24 | -------------------------------------------------------------------------------- /test/content_analyzer/ratings_manager/test_sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from clayrs.content_analyzer.ratings_manager.sentiment_analysis import TextBlobSentimentAnalysis 4 | from textblob import TextBlob 5 | import numpy as np 6 | 7 | 8 | class TestTextBlobSentimentAnalysis(TestCase): 9 | def test_fit(self): 10 | text_reviews = ['good item', 'it was awful', 'pretty good', 'extraordinary', 'too much expensive'] 11 | 12 | result = [TextBlobSentimentAnalysis().fit(text) for text in text_reviews] 13 | expected = [TextBlob(field_data).sentiment.polarity for field_data in text_reviews] 14 | 15 | self.assertEqual(expected, result) 16 | 17 | result_rounded = [TextBlobSentimentAnalysis(decimal_rounding=4).fit(text) for text in text_reviews] 18 | expected_rounded = [np.round(TextBlob(field_data).sentiment.polarity, 4) for field_data in text_reviews] 19 | 20 | self.assertEqual(expected_rounded, result_rounded) 21 | -------------------------------------------------------------------------------- /clayrs/recsys/algorithm.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import ABC 3 | 4 | 5 | class Algorithm(ABC): 6 | """ 7 | Abstract class for an Algorithm. 8 | 9 | Every algorithm must be able to predict or to rank, or maybe both. 10 | In case some algorithms can only do one of the two (eg. PageRank), simply implement both 11 | methods and raise the NotPredictionAlg or NotRankingAlg exception accordingly. 12 | """ 13 | __slots__ = () 14 | 15 | @abc.abstractmethod 16 | def predict(self, **kwargs): 17 | """ 18 | Method to call when score prediction must be done. 19 | 20 | If the Algorithm can't do score prediction, implement this method and raise 21 | the NotPredictionAlg exception 22 | """ 23 | raise NotImplementedError 24 | 25 | @abc.abstractmethod 26 | def rank(self, **kwargs): 27 | """ 28 | Method to call when ranking must be done. 
29 | 30 | If the Algorithm can't rank, implement this method and raise the NotRankingAlg exception 31 | """ 32 | raise NotImplementedError 33 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/document_embeddings.md: -------------------------------------------------------------------------------- 1 | # Document Embeddings 2 | 3 | Via the following, you can obtain embeddings of ***document*** granularity 4 | 5 | ```python 6 | from clayrs import content_analyzer as ca 7 | 8 | # obtain document embeddings by training an LDA model 9 | # on the corpus of contents to represent 10 | ca.DocumentEmbeddingTechnique(embedding_source=ca.GensimLDA()) 11 | ``` 12 | 13 | ::: clayrs.content_analyzer.DocumentEmbeddingTechnique 14 | handler: python 15 | options: 16 | show_root_toc_entry: true 17 | show_root_heading: true 18 | 19 | ## Document Embedding models 20 | 21 | ::: clayrs.content_analyzer.GensimLatentSemanticAnalysis 22 | handler: python 23 | options: 24 | heading_level: 3 25 | show_root_toc_entry: true 26 | show_root_heading: true 27 | 28 | ::: clayrs.content_analyzer.GensimLDA 29 | handler: python 30 | options: 31 | heading_level: 3 32 | show_root_toc_entry: true 33 | show_root_heading: true 34 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_loader/test_sbert.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | from unittest import TestCase, mock 3 | import numpy as np 4 | 5 | from clayrs.content_analyzer.embeddings import Sbert 6 | 7 | result_matrix = { 8 | 'this is a phrase': np.array([random() for _ in range(768)]), 9 | 'this is another phrase': np.array([random() for _ in range(768)]) 10 | } 11 | 12 | 13 | def encode(sentence, show_progress_bar): 14 | return result_matrix[sentence] 15 | 16 | 17 | class TestSbert(TestCase): 18 | 19 | @mock.patch('clayrs.content_analyzer.embeddings.sbert.SentenceTransformer') 20 | def test_sbert(self, mocked_model): 21 | instance = mocked_model.return_value 22 | instance.get_sentence_embedding_dimension.return_value = 768 23 | instance.encode.side_effect = encode 24 | 25 | source = Sbert() 26 | 27 | vector_size = source.get_vector_size() 28 | 29 | result = source.load(["this is a phrase", "this is another phrase"]) 30 | 31 | self.assertEqual(len(result), 2) 32 | self.assertEqual(len(result[0]), vector_size) 33 | self.assertEqual(len(result[1]), vector_size) 34 | -------------------------------------------------------------------------------- /test/utils/test_class_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from clayrs.content_analyzer.config import ContentAnalyzerConfig, FieldConfig, ItemAnalyzerConfig, \ 4 | UserAnalyzerConfig 5 | from clayrs.utils.class_utils import get_all_implemented_classes, get_all_implemented_subclasses 6 | 7 | 8 | class TestClassUtils(TestCase): 9 | 10 | def test_get_all_implemented_classes(self): 11 | 12 | results = get_all_implemented_classes(ContentAnalyzerConfig) 13 | 14 | expected_results = {ItemAnalyzerConfig, UserAnalyzerConfig} 15 | self.assertEqual(results, expected_results) 16 | 17 | results = get_all_implemented_classes(FieldConfig) 18 | 19 | expected_results = {FieldConfig} 20 | self.assertEqual(results, expected_results) 21 | 22 | def 
test_get_all_implemented_subclasses(self): 23 | 24 | results = get_all_implemented_subclasses(ContentAnalyzerConfig) 25 | 26 | expected_results = {ItemAnalyzerConfig, UserAnalyzerConfig} 27 | self.assertEqual(results, expected_results) 28 | 29 | results = get_all_implemented_subclasses(FieldConfig) 30 | expected_results = set() 31 | 32 | self.assertEqual(results, expected_results) -------------------------------------------------------------------------------- /docs/mkdocs/docs/first_steps/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Via PIP (recommended) { data-toc-label="Via PIP" } 4 | 5 | *ClayRS* requires Python **3.8** or later, while package dependencies are in `requirements.txt` and are all installable 6 | via `pip`, as *ClayRS* itself. 7 | 8 | To install it, execute the following command: 9 | 10 | === "Latest" 11 | 12 | ``` sh 13 | pip install clayrs 14 | ``` 15 | 16 | This will automatically install compatible versions of all dependencies. 17 | 18 | --- 19 | **Tip**: We suggest installing ClayRS (or any Python package, for that matter) in a virtual environment. 20 | 21 | !!! quote "" 22 | *Virtual environments are special isolated environments where all the packages and versions you install only 23 | apply to that specific environment. It’s like a private island! — but for code.* 24 | 25 | Read this [Medium article][medium] to understand all the advantages, and the [official python guide][venv] 26 | on how to set up one. 27 | 28 | [medium]: https://towardsdatascience.com/why-you-should-use-a-virtual-environment-for-every-python-project-c17dab3b0fd0 29 | [venv]: https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/ -------------------------------------------------------------------------------- /test/test_files/test_images/tradesy_small_online.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"productUrl": "https://www.tradesy.com/tops/haute-hippie-top-white-and-black-1015/?tref=category", "imageUrl": "https://item1.tradesy.com/images/item/2/tops/haute-hippie/8-m/haute-hippie-top-white-and-black-1015-1.jpg", "itemID": "1015"}, 3 | {"productUrl": "https://www.tradesy.com/bottoms/anthropologie-skirt-light-pink-434/?tref=category", "imageUrl": "https://item5.tradesy.com/images/item/2/bottoms/anthropologie/0-xs-25/anthropologie-skirt-light-pink-434-1.jpg", "itemID": "434"}, 4 | {"productUrl": "https://www.tradesy.com/bottoms/wildfox-floral-print-leggings-357/?tref=category", "imageUrl": "https://item3.tradesy.com/images/item/2/bottoms/wildfox/0-xs-25/wildfox-floral-print-leggings-357-1.jpg", "itemID": "357"}, 5 | {"productUrl": "https://www.tradesy.com/dresses/o-neill-dress-black-and-white-164/?tref=category", "imageUrl": "https://item5.tradesy.com/images/item/2/dresses/o-neill/8-m/o-neill-dress-black-and-white-164-1.jpg", "itemID": "164"}, 6 | {"productUrl": "https://www.tradesy.com/tops/elizabeth-and-james-top-ecru-and-pink-81/?tref=category", "imageUrl": "https://item2.tradesy.com/images/item/2/tops/elizabeth-and-james/0-xs/elizabeth-and-james-top-ecru-and-pink-81-1.jpg", "itemID": "81"} 7 | ] -------------------------------------------------------------------------------- /test/content_analyzer/field_content_production_techniques/test_tf_idf.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import os 3 | 4 | from clayrs.content_analyzer.content_representation.content import FeaturesBagField 5 | from clayrs.content_analyzer.field_content_production_techniques.tf_idf import WhooshTfIdf, SkLearnTfIdf 6 | from clayrs.content_analyzer.raw_information_source import JSONFile 7 | from test import dir_test_files 8 | 9 | THIS_DIR = os.path.dirname(os.path.abspath(__file__)) 10 | file_path = os.path.join(dir_test_files, "movies_info_reduced.json") 11 | 12 | 13 | class TestWhooshTfIdf(TestCase): 14 | def test_produce_content(self): 15 | technique = WhooshTfIdf() 16 | 17 | features_bag_list = technique.produce_content("Plot", [], [], JSONFile(file_path)) 18 | 19 | self.assertEqual(len(features_bag_list), 20) 20 | self.assertIsInstance(features_bag_list[0], FeaturesBagField) 21 | 22 | 23 | class TestSkLearnTfIdf(TestCase): 24 | 25 | def test_produce_content(self): 26 | technique = SkLearnTfIdf() 27 | 28 | features_bag_list = technique.produce_content("Title", [], [], JSONFile(file_path)) 29 | 30 | self.assertEqual(len(features_bag_list), 20) 31 | self.assertIsInstance(features_bag_list[0], FeaturesBagField) 32 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/utils/check_tokenization.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | from nltk import data, download, sent_tokenize, word_tokenize 3 | 4 | # nltk corpus 5 | corpus_downloaded = False 6 | 7 | 8 | def check_tokenized(text): 9 | """ 10 | Tokenizes a text 11 | """ 12 | if type(text) is str: 13 | global corpus_downloaded 14 | 15 | if not corpus_downloaded: 16 | try: 17 | data.find('punkt') 18 | except LookupError: 19 | download('punkt') 20 | 21 | corpus_downloaded = True 22 | 23 | text = word_tokenize(text) 24 | 25 | return text 26 | 27 | 28 | def check_not_tokenized(text): 29 | """ 30 | Untokenizes a tokenized text 31 | """ 32 | if type(text) is list: 33 | text = ' '.join(text) 34 | 35 | return text 36 | 37 | 38 | def tokenize_in_sentences(text: Union[List[str], str]): 39 | """ 40 | Tokenizes a text into sentences 41 | """ 42 | global corpus_downloaded 43 | 44 | if not corpus_downloaded: 45 | try: 46 | data.find('punkt') 47 | except LookupError: 48 | download('punkt') 49 | 50 | corpus_downloaded = True 51 | 52 | return sent_tokenize(check_not_tokenized(text)) 53 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/sentence_embeddings.md: -------------------------------------------------------------------------------- 1 | # Sentence Embeddings 2 | 3 | Via the following, you can obtain embeddings of ***sentence*** granularity 4 | 5 | ```python 6 | from clayrs import content_analyzer as ca 7 | 8 | # obtain sentence embeddings using the pre-trained model
9 | # 'paraphrase-distilroberta-base-v1' from the SBERT library 10 | ca.SentenceEmbeddingTechnique(embedding_source=ca.Sbert('paraphrase-distilroberta-base-v1')) 11 | ``` 12 | 13 | ::: clayrs.content_analyzer.SentenceEmbeddingTechnique 14 | handler: python 15 | options: 16 | show_root_toc_entry: true 17 | show_root_heading: true 18 | 19 | ## Sentence Embedding models 20 | 21 | ::: clayrs.content_analyzer.BertTransformers 22 | handler: python 23 | options: 24 | heading_level: 3 25 | show_root_toc_entry: true 26 | show_root_heading: true 27 | 28 | ::: clayrs.content_analyzer.Sbert 29 | handler: python 30 | options: 31 | heading_level: 3 32 | show_root_toc_entry: true 33 | show_root_heading: true 34 | 35 | ::: clayrs.content_analyzer.T5Transformers 36 | handler: python 37 | options: 38 | heading_level: 3 39 | show_root_toc_entry: true 40 | show_root_heading: true 41 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/ratings_manager/sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | from textblob import TextBlob 2 | 3 | from clayrs.content_analyzer.ratings_manager.score_processor import SentimentAnalysis 4 | 5 | 6 | class TextBlobSentimentAnalysis(SentimentAnalysis): 7 | """ 8 | Class that computes sentiment polarity on a textual field using the TextBlob library. 9 | 10 | The given score will be in the $[-1.0, 1.0]$ range 11 | """ 12 | def __init__(self, decimal_rounding: int = None): 13 | super().__init__(decimal_rounding) 14 | 15 | def __str__(self): 16 | return "TextBlobSentimentAnalysis" 17 | 18 | def __repr__(self): 19 | return f'TextBlobSentimentAnalysis' 20 | 21 | def fit(self, score_data: str) -> float: 22 | """ 23 | This method calculates the sentiment polarity score on textual reviews 24 | 25 | Args: 26 | score_data: text for which sentiment polarity must be computed and considered as score 27 | 28 | Returns: 29 | The sentiment polarity of the textual data in range $[-1.0, 1.0]$ 30 | """ 31 | polarity_score = TextBlob(score_data).sentiment.polarity 32 | 33 | if self.decimal_rounding is not None: 34 | polarity_score = round(polarity_score, self.decimal_rounding) 35 | 36 | return polarity_score 37 | -------------------------------------------------------------------------------- /test/test_files/test_decode/movies_title_string.json: -------------------------------------------------------------------------------- 1 | [{"Title":"test","Year":"1995","Rated":"PG","Released":"15 Dec 1995","Runtime":"104 min","Genre":"Adventure, Family, Fantasy","Director":"Joe Johnston","Writer":"Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Actors":"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","Plot":"After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city.
Now it is up to him to stop them.","Language":"English, French","Country":"USA","Awards":"4 wins & 9 nominations.","Poster":"https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"6.9/10"},{"Source":"Rotten Tomatoes","Value":"53%"},{"Source":"Metacritic","Value":"39/100"}],"Metascore":"39","imdbRating":"6.9","imdbVotes":"260,909","imdbID":"tt0113497","Type":"movie","DVD":"25 Jan 2000","BoxOffice":"N/A","Production":"Sony Pictures Home Entertainment","Website":"N/A","Response":"True"}] -------------------------------------------------------------------------------- /test/test_files/test_decode/movies_title_tfidf.json: -------------------------------------------------------------------------------- 1 | [{"Title":"{'jumanji': 2}","Year":"1995","Rated":"PG","Released":"15 Dec 1995","Runtime":"104 min","Genre":"Adventure, Family, Fantasy","Director":"Joe Johnston","Writer":"Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Actors":"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","Plot":"After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","Language":"English, French","Country":"USA","Awards":"4 wins & 9 nominations.","Poster":"https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"6.9/10"},{"Source":"Rotten Tomatoes","Value":"53%"},{"Source":"Metacritic","Value":"39/100"}],"Metascore":"39","imdbRating":"6.9","imdbVotes":"260,909","imdbID":"tt0113497","Type":"movie","DVD":"25 Jan 2000","BoxOffice":"N/A","Production":"Sony Pictures Home Entertainment","Website":"N/A","Response":"True"}] -------------------------------------------------------------------------------- /clayrs/utils/automatic_methods.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | def autorepr(obj, frame): 5 | # pull tuple from frame 6 | args, args_paramname, kwargs_paramname, values = inspect.getargvalues(frame) 7 | 8 | args = args[1:] # remove 'self' argument from function 9 | 10 | arg_string = '' 11 | 12 | # add to arg string formal argument 13 | arg_string += ', '.join([f"{arg}={repr(values[arg])}" 14 | for arg in (args if args is not None else [])]) 15 | 16 | # show positional varargs 17 | if args_paramname is not None: 18 | varglist = values[args_paramname] 19 | if len(arg_string) != 0: 20 | arg_string += ', ' 21 | arg_string += ', '.join([f"*{args_paramname}={repr(v)}" 22 | for v in (varglist if varglist is not None else [])]) 23 | 24 | # show named varargs 25 | if kwargs_paramname is not None: 26 | varglist = values[kwargs_paramname] 27 | if len(arg_string) != 0: 28 | arg_string += ', ' 29 | arg_string += ', '.join([f"*{kwargs_paramname}_{k}={repr(varglist[k])}" 30 | for k in (sorted(varglist) if varglist is not None else [])]) 31 | 32 | name_obj = obj.__class__.__name__ 33 | repr_string = f"{name_obj}({arg_string})" 34 | 35 | return repr_string 36 | 
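The `autorepr` helper above is easiest to understand with a usage sketch. The following is a minimal, hypothetical example (the `ExampleTechnique` class is invented purely for illustration and is not part of ClayRS), assuming the convention used elsewhere in the framework: a class captures its own constructor frame with `inspect.currentframe()` and delegates the `repr` string to `autorepr`.

```python
import inspect

from clayrs.utils.automatic_methods import autorepr


class ExampleTechnique:
    # hypothetical class, used only to illustrate how autorepr is wired in
    def __init__(self, alpha: float = 0.5, **kwargs):
        # autorepr inspects this very frame to recover the arguments actually passed
        self._repr_string = autorepr(self, inspect.currentframe())

    def __repr__(self):
        return self._repr_string


# should print something like: ExampleTechnique(alpha=0.1, *kwargs_beta=2)
print(repr(ExampleTechnique(alpha=0.1, beta=2)))
```

Capturing the frame inside `__init__` is what lets `autorepr` read the concrete argument values without each class having to re-list them by hand.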
-------------------------------------------------------------------------------- /.github/workflows/testing_pipeline.yml: -------------------------------------------------------------------------------- 1 | name: Testing pipeline 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | workflow_dispatch: 10 | 11 | jobs: 12 | main: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ['3.8', '3.10'] 17 | 18 | steps: 19 | - 20 | name: Checkout Repository 21 | uses: actions/checkout@v3 22 | - 23 | name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - 28 | name: Setup installation requirements 29 | run: | 30 | pip install --upgrade pip wheel 31 | pip install pytest-cov==3.00 32 | pip install -e . 33 | - 34 | name: Testing Python ${{ matrix.python-version }} with coverage 35 | run: | 36 | pytest --color=yes --cov-report xml:codecoverage_${{ matrix.python-version }}.xml --cov=clayrs test/ 37 | - 38 | name: Upload coverage to Codecov 39 | uses: codecov/codecov-action@v3 40 | with: # no token required for public repos 41 | fail_ci_if_error: true 42 | files: ./codecoverage_${{ matrix.python-version }}.xml 43 | flags: python_${{ matrix.python-version }} 44 | -------------------------------------------------------------------------------- /clayrs/recsys/content_based_algorithm/centroid_vector/similarities.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union 3 | 4 | import numpy as np 5 | from scipy import sparse 6 | from sklearn.metrics.pairwise import cosine_similarity 7 | 8 | 9 | class Similarity(ABC): 10 | """ 11 | Abstract Class for the various types of similarity 12 | """ 13 | def __init__(self): 14 | pass 15 | 16 | @abstractmethod 17 | def perform(self, v1: Union[np.ndarray, sparse.csr_matrix], v2: Union[np.ndarray, sparse.csr_matrix]): 18 | """ 19 | Calculates the similarity between v1 and v2 20 | """ 21 | raise NotImplementedError 22 | 23 | 24 | class CosineSimilarity(Similarity): 25 | """ 26 | Computes cosine similarity 27 | """ 28 | def __init__(self): 29 | super().__init__() 30 | 31 | def perform(self, v1: Union[np.ndarray, sparse.csr_matrix], v2: Union[np.ndarray, sparse.csr_matrix]): 32 | """ 33 | Calculates the cosine similarity between v1 and v2 34 | 35 | Args: 36 | v1: first numpy array 37 | v2: second numpy array 38 | """ 39 | 40 | return cosine_similarity(v1, v2, dense_output=True) 41 | 42 | def __str__(self): 43 | return "CosineSimilarity" 44 | 45 | def __repr__(self): 46 | return f"CosineSimilarity()" 47 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/contextualized_embeddings.md: -------------------------------------------------------------------------------- 1 | # Contextualized Embeddings 2 | 3 | Via the following, you can obtain embeddings of *finer* granularity from models which are able to return also 4 | embeddings of *coarser* granularity (e.g. obtain word embeddings from a model which is also able to return sentence 5 | embeddings). 
6 | 7 | For now only models working at sentence and token level are implemented 8 | 9 | ```python 10 | from clayrs import content_analyzer as ca 11 | 12 | # obtain word (token) embeddings from a model which natively 13 | # works at sentence granularity 14 | ca.Sentence2WordEmbedding(embedding_source=ca.BertTransformers('bert-base-uncased')) 15 | ``` 16 | 17 | ::: clayrs.content_analyzer.Sentence2WordEmbedding 18 | handler: python 19 | options: 20 | show_root_toc_entry: true 21 | show_root_heading: true 22 | 23 | ## Models able to return sentence and token embeddings 24 | 25 | ::: clayrs.content_analyzer.BertTransformers 26 | handler: python 27 | options: 28 | heading_level: 3 29 | show_root_toc_entry: true 30 | show_root_heading: true 31 | 32 | ::: clayrs.content_analyzer.T5Transformers 33 | handler: python 34 | options: 35 | heading_level: 3 36 | show_root_toc_entry: true 37 | show_root_heading: true 38 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/utils/id_merger.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | 4 | def id_values_merger(id_values: List[str]): 5 | """ 6 | This function is used to compact a list of ids into a unique string. This can be useful when 7 | there is content whose id is composed of values coming from more than one field. 8 | 9 | Args: 10 | id_values (List): List containing one or more ids 11 | 12 | Returns: 13 | id_merged (str): String in which the values contained in the list given in input are 14 | merged 15 | """ 16 | if type(id_values) == str or type(id_values) == int: 17 | return str(id_values) 18 | elif type(id_values) == list: 19 | id_merged = "" 20 | for i in range(len(id_values)): 21 | id_merged += str(id_values[i]) 22 | if i != len(id_values) - 1: 23 | id_merged += "_" 24 | return id_merged 25 | else: 26 | raise TypeError("id must be an integer, a string or a list of strings and/or integers") 27 | 28 | 29 | def id_merger(raw_content: dict, field_list: List[str]) -> str: 30 | """ 31 | Function that creates the list of ids and then calls id_values_merger to create a unique id 32 | """ 33 | id_values = [] 34 | for field_name in field_list: 35 | id_values.append(raw_content[field_name]) 36 | 37 | return id_values_merger(id_values) 38 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/recsys/graph_based/feature_selection.md: -------------------------------------------------------------------------------- 1 | # Feature Selection 2 | 3 | Via the `feature_selecter` function you are able to perform feature selection on a given graph, by keeping the properties 4 | that are the most important according to a given ***feature selection algorithm***. Check the documentation of the 5 | method for more details and for a *usage example* 6 | 7 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_fn 8 | handler: python 9 | 10 | --- 11 | 12 | ## Feature Selection algorithms 13 | 14 | The following are the feature selection algorithms you can use in the `fs_algorithms_user` 15 | and/or in the `fs_algorithm_item` 16 | 17 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_alg.TopKPageRank 18 | handler: python 19 | options: 20 | heading_level: 3 21 | show_root_toc_entry: true 22 | show_root_heading: true 23 | members: none 24 | 25 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_alg.TopKEigenVectorCentrality 26 | handler: python 27 | options: 28 | heading_level: 3 29 | show_root_toc_entry: true 30 | show_root_heading: true 31 | members: none 32 | 33 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_alg.TopKDegreeCentrality 34 | handler: python 35 | options: 36 | heading_level: 3 37 | show_root_toc_entry: true 38 | show_root_heading: true 39 | members: none 40 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_loader/gensim.py: -------------------------------------------------------------------------------- 1 | from gensim import downloader 2 | import numpy as np 3 | 4 | from clayrs.utils.const import logger 5 | from clayrs.content_analyzer.embeddings.embedding_loader.embedding_loader import WordEmbeddingLoader 6 | 7 | 8 | class Gensim(WordEmbeddingLoader): 9 | """ 10 | Class that produces word embeddings using gensim pre-trained models. 11 | 12 | The model will be automatically downloaded using the gensim downloader api if not present locally. 13 | 14 | Args: 15 | model_name: Name of the model to load/download 16 | """ 17 | 18 | def __init__(self, model_name: str = 'glove-twitter-25'): 19 | super().__init__(model_name) 20 | 21 | def get_vector_size(self) -> int: 22 | return self.model.vector_size 23 | 24 | def get_embedding(self, word: str) -> np.ndarray: 25 | return self.model[word] 26 | 27 | def load_model(self): 28 | # if the reference isn't in the possible models, FileNotFoundError is raised 29 | if self.reference in downloader.info()['models']: 30 | logger.info(f"Downloading/Loading {str(self)}") 31 | 32 | return downloader.load(self.reference) 33 | else: 34 | raise FileNotFoundError 35 | 36 | def __str__(self): 37 | return f"Gensim {self.reference}" 38 | 39 | def __repr__(self): 40 | return f'Gensim(model_name={self.reference})' 41 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/information_preprocessors/postprocessors/postprocessor.md: -------------------------------------------------------------------------------- 1 | # Postprocessor 2 | 3 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.CountVisualBagOfWords 4 | handler: python 5 | options: 6 | show_root_toc_entry: true 7 | show_root_heading: true 8 | 9 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.TfIdfVisualBagOfWords 10 | handler: python 11 | options: 12 | show_root_toc_entry: true 13 | show_root_heading: true 14 | 15 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.ScipyVQ 16 | handler: python 17 | options: 18 | show_root_toc_entry: true 19 | show_root_heading: true 20 | 21 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.SkLearnPCA 22 | handler: python 23 | options: 24
| show_root_toc_entry: true 25 | show_root_heading: true 26 | 27 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.SkLearnGaussianRandomProjections 28 | handler: python 29 | options: 30 | show_root_toc_entry: true 31 | show_root_heading: true 32 | 33 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.SkLearnFeatureAgglomeration 34 | handler: python 35 | options: 36 | show_root_toc_entry: true 37 | show_root_heading: true 38 | -------------------------------------------------------------------------------- /test/content_analyzer/information_processor/test_visualpreprocessors/test_torch_builtin_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from torchvision import transforms 4 | 5 | import clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_augmenter as clayrs_augments 6 | 7 | from test.content_analyzer.information_processor.test_visualpreprocessors.test_torch_builtin_transformer import \ 8 | TestTorchBuiltInTransformer 9 | 10 | 11 | class TestTorchAutoAugment(TestTorchBuiltInTransformer): 12 | 13 | def setUp(self): 14 | self.technique = clayrs_augments.TorchAutoAugment() 15 | self.og_technique = transforms.AutoAugment() 16 | 17 | def test_forward(self): 18 | self.assertTrue(self.expected_result_equal()) 19 | 20 | 21 | class TestTorchRandAugment(TestTorchBuiltInTransformer): 22 | 23 | def setUp(self): 24 | self.technique = clayrs_augments.TorchRandAugment() 25 | self.og_technique = transforms.RandAugment() 26 | 27 | def test_forward(self): 28 | self.assertTrue(self.expected_result_equal()) 29 | 30 | 31 | class TestTorchTrivialAugmentWide(TestTorchBuiltInTransformer): 32 | 33 | def setUp(self): 34 | self.technique = clayrs_augments.TorchTrivialAugmentWide() 35 | self.og_technique = transforms.TrivialAugmentWide() 36 | 37 | def test_forward(self): 38 | self.assertTrue(self.expected_result_equal()) 39 | 40 | 41 | if __name__ == "__main__": 42 | unittest.main() 43 | -------------------------------------------------------------------------------- /clayrs/utils/custom_logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional, Dict 3 | 4 | from colorama import Fore, Back, Style 5 | 6 | 7 | class ColoredFormatter(logging.Formatter): 8 | """Colored log formatter.""" 9 | 10 | def __init__(self, *args, colors: Optional[Dict[str, str]] = None, **kwargs) -> None: 11 | """Initialize the formatter with specified format strings.""" 12 | 13 | super().__init__(*args, **kwargs) 14 | 15 | self.colors = colors if colors else {} 16 | 17 | def format(self, record) -> str: 18 | """Format the specified record as text.""" 19 | 20 | record.color = self.colors.get(record.levelname, '') 21 | record.reset = Style.RESET_ALL 22 | 23 | return super().format(record) 24 | 25 | 26 | formatter = ColoredFormatter( 27 | "\r{color}{levelname}{reset} - {message}", 28 | style='{', 29 | colors={ 30 | 'DEBUG': Fore.CYAN, 31 | 'INFO': Fore.RESET, 32 | 'WARNING': Fore.YELLOW, 33 | 'ERROR': Fore.RED, 34 | 'CRITICAL': Fore.BLACK + Back.RED + Style.BRIGHT, 35 | } 36 | ) 37 | 38 | 39 | def get_custom_logger(name: str): 40 | handler = logging.StreamHandler() 41 | handler.setFormatter(formatter) 42 | 43 | logger = logging.getLogger(name) 44 | if len(logger.handlers) != 0: 45 | logger.handlers.clear() 46 | 47 | logger.addHandler(handler) 48 | logger.setLevel(logging.INFO) 49 | logger.propagate = 
False 50 | return logger 51 | -------------------------------------------------------------------------------- /clayrs/recsys/content_based_algorithm/exceptions.py: -------------------------------------------------------------------------------- 1 | class UserSkipAlgFit(Exception): 2 | """ 3 | Superclass for exceptions related to the fit of a single user. If one of these exceptions occurs, the algorithm 4 | can't be fitted for the user, who will therefore be skipped 5 | """ 6 | pass 7 | 8 | 9 | class OnlyPositiveItems(UserSkipAlgFit): 10 | """ 11 | Exception to raise when there are only positive items available locally for the user 12 | """ 13 | pass 14 | 15 | 16 | class OnlyNegativeItems(UserSkipAlgFit): 17 | """ 18 | Exception to raise when there are only negative items available locally for the user 19 | """ 20 | pass 21 | 22 | 23 | class NoRatedItems(UserSkipAlgFit): 24 | """ 25 | Exception to raise when there is no item available locally for the user 26 | """ 27 | pass 28 | 29 | 30 | class EmptyUserRatings(UserSkipAlgFit): 31 | """ 32 | Exception to raise when the user's ratings are empty 33 | """ 34 | pass 35 | 36 | 37 | class NotRankingAlg(Exception): 38 | """ 39 | Exception to raise when the algorithm is not a ranking algorithm, but it is asked to rank 40 | """ 41 | pass 42 | 43 | 44 | class NotPredictionAlg(Exception): 45 | """ 46 | Exception to raise when the algorithm is not a prediction algorithm, but it is asked to predict 47 | """ 48 | pass 49 | 50 | 51 | class NotFittedAlg(Exception): 52 | """ 53 | Exception to raise when the algorithm has not been fitted 54 | """ 55 | pass 56 | -------------------------------------------------------------------------------- /clayrs/utils/class_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Type, Set 2 | import inspect 3 | 4 | 5 | def get_all_implemented_subclasses(cls: Type) -> Set: 6 | """ 7 | Method that retrieves all implemented subclasses of a given class 8 | (also considering subclasses of a subclass and so on) 9 | 10 | The method calls itself to find the subclasses of each subclass 11 | 12 | Args: 13 | cls (Type): class from which all implemented subclasses will be extracted 14 | 15 | Returns: 16 | set containing all of cls' implemented subclasses 17 | """ 18 | return set([sub for sub in cls.__subclasses__() if not inspect.isabstract(sub)]).union( 19 | [sub for c in cls.__subclasses__() for sub in get_all_implemented_subclasses(c) if not inspect.isabstract(sub)]) 20 | 21 | 22 | def get_all_implemented_classes(cls: Type) -> Set: 23 | """ 24 | Method that retrieves all implemented subclasses of a given class 25 | (also considering subclasses of a subclass and so on) 26 | 27 | The method calls itself to find the subclasses of each subclass 28 | 29 | If the class passed as argument is not abstract, it is added to the result set 30 | 31 | Args: 32 | cls (Type): class from which all implemented subclasses will be extracted 33 | 34 | Returns: 35 | set containing all of cls' implemented subclasses and cls itself if it is not abstract 36 | """ 37 | 38 | classes = get_all_implemented_subclasses(cls) 39 | 40 | if not inspect.isabstract(cls): 41 | classes.add(cls) 42 | 43 | return classes 44 | -------------------------------------------------------------------------------- /test/test_files/test_decode/movies_title_embedding.json: -------------------------------------------------------------------------------- 1 |
[{"Title":"[0.10984,-0.72454,1.21200001,-0.16188,-0.77879,-0.31345001,-0.27814001,-0.27860001,0.33089,-0.62764001,0.31617999,0.34035999,-0.66911,-0.52311999,1.66120005,1.10749996,0.25200999,0.098685,0.96275002,0.66688001,-0.33248001,0.22236,0.67574,-1.01069999,0.27109]","Year":"1995","Rated":"PG","Released":"15 Dec 1995","Runtime":"104 min","Genre":"Adventure, Family, Fantasy","Director":"Joe Johnston","Writer":"Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Actors":"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","Plot":"After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","Language":"English, French","Country":"USA","Awards":"4 wins & 9 nominations.","Poster":"https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"6.9/10"},{"Source":"Rotten Tomatoes","Value":"53%"},{"Source":"Metacritic","Value":"39/100"}],"Metascore":"39","imdbRating":"6.9","imdbVotes":"260,909","imdbID":"tt0113497","Type":"movie","DVD":"25 Jan 2000","BoxOffice":"N/A","Production":"Sony Pictures Home Entertainment","Website":"N/A","Response":"True"}] -------------------------------------------------------------------------------- /clayrs/content_analyzer/exceptions.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | 3 | import numpy as np 4 | 5 | 6 | def handler_score_not_float(func): 7 | """ 8 | Handler that catches the above exception. 9 | 10 | Tries to run the functions normally, if one of the above exceptions is caught then it must return 11 | an empty frame for the user since predictions can't be calculated for it. 12 | """ 13 | @wraps(func) 14 | def inner_function(*args, **kwargs): 15 | try: 16 | return func(*args, **kwargs) 17 | except ValueError: 18 | raise ValueError("The 'score' and 'timestamp' columns must contains numbers!\n" 19 | "Try to apply a score processor or change columns!") from None 20 | 21 | return inner_function 22 | 23 | 24 | def handler_empty_matrix(dtype): 25 | 26 | def handler_for_function(func): 27 | """ 28 | Handler that catches the above exception. 29 | 30 | Tries to run the functions normally, if one of the above exceptions is caught then it must return 31 | an empty frame for the user since predictions can't be calculated for it. 
32 | """ 33 | @wraps(func) 34 | def inner_function(*args, **kwargs): 35 | try: 36 | return func(*args, **kwargs) 37 | except IndexError: 38 | return np.array([], dtype=dtype) 39 | 40 | return inner_function 41 | 42 | return handler_for_function 43 | 44 | 45 | class UserNone(Exception): 46 | pass 47 | 48 | 49 | class ItemNone(Exception): 50 | pass 51 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/test_embedding_source.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import numpy as np 4 | from unittest import TestCase 5 | from math import isclose 6 | 7 | 8 | class TestEmbeddingSource(TestCase): 9 | 10 | # Will be used by several test involving embeddings 11 | def assertWordEmbeddingMatches(self, source, embedding: np.ndarray, word: str): 12 | # 'similar_by_vector()' returns a list with top n 13 | # words similar to the vector given. I'm interested only in the most similar 14 | # so n = 1 15 | # for example, top_1 will be in the following form ("title", 1.0) 16 | top_1 = source.model.similar_by_vector(embedding, 1)[0] 17 | 18 | # So I'm using indices to access the tuples values. 19 | # 'like' contains how similar is 'embedding_word' to the 'embedding' vector given 20 | embedding_word = top_1[0] 21 | like = top_1[1] 22 | 23 | # if the word associated with the embedding vector returned by the model doesn't match the word passed as 24 | # argument, AssertionError is raised 25 | if not embedding_word == word: 26 | raise AssertionError("Word %s is not %s" % (embedding_word, word)) 27 | 28 | # Obviously due to approximation the conversion won't return the 29 | # exact word, but if the likelihood it's equal to 1 with a maximum error of 'abs_tol' 30 | # I'm assuming it's exactly that word 31 | if not isclose(like, 1, abs_tol=1e-6): 32 | raise AssertionError("Word %s and result word %s do not match" % (embedding_word, word)) 33 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/word_embeddings.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | Via the following, you can obtain embeddings of ***word*** granularity 4 | 5 | ```python 6 | from clayrs import content_analyzer as ca 7 | 8 | # obtain word embeddings using pre-trained model 'glove-twitter-50' 9 | # from Gensim library 10 | ca.WordEmbeddingTechnique(embedding_source=ca.Gensim('glove-twitter-50')) 11 | ``` 12 | 13 | ::: clayrs.content_analyzer.WordEmbeddingTechnique 14 | handler: python 15 | options: 16 | show_root_toc_entry: true 17 | show_root_heading: true 18 | 19 | 20 | ## Word Embedding models 21 | 22 | ::: clayrs.content_analyzer.Gensim 23 | handler: python 24 | options: 25 | heading_level: 3 26 | show_root_toc_entry: true 27 | show_root_heading: true 28 | 29 | ::: clayrs.content_analyzer.GensimDoc2Vec 30 | handler: python 31 | options: 32 | heading_level: 3 33 | show_root_toc_entry: true 34 | show_root_heading: true 35 | 36 | ::: clayrs.content_analyzer.GensimFastText 37 | handler: python 38 | options: 39 | heading_level: 3 40 | show_root_toc_entry: true 41 | show_root_heading: true 42 | 43 | ::: clayrs.content_analyzer.GensimRandomIndexing 44 | handler: python 45 | options: 46 | heading_level: 3 47 | show_root_toc_entry: true 48 | show_root_heading: true 49 | 50 | ::: clayrs.content_analyzer.GensimWord2Vec 
51 | handler: python 52 | options: 53 | heading_level: 3 54 | show_root_toc_entry: true 55 | show_root_heading: true 56 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_loader/sbert.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | import numpy as np 3 | 4 | from clayrs.content_analyzer.embeddings.embedding_loader.embedding_loader import SentenceEmbeddingLoader 5 | from clayrs.utils.const import logger 6 | 7 | 8 | class Sbert(SentenceEmbeddingLoader): 9 | """ 10 | Class that produces sentence embeddings using SBERT. 11 | 12 | The model will be automatically downloaded if not present locally. 13 | 14 | Args: 15 | model_name_or_file_path: name of the model to download or path where the model is stored 16 | locally 17 | """ 18 | 19 | def __init__(self, model_name_or_file_path: str = 'paraphrase-distilroberta-base-v1'): 20 | super().__init__(model_name_or_file_path) 21 | 22 | def load_model(self): 23 | try: 24 | logger.info(f"Downloading/Loading {str(self)}") 25 | 26 | return SentenceTransformer(self.reference) 27 | except (OSError, AttributeError): 28 | raise FileNotFoundError("Model not found!") 29 | 30 | def get_vector_size(self) -> int: 31 | return self.model.get_sentence_embedding_dimension() 32 | 33 | def get_embedding(self, sentence: str) -> np.ndarray: 34 | return self.model.encode(sentence, show_progress_bar=False) 35 | 36 | def get_embedding_token(self, sentence: str) -> np.ndarray: 37 | raise NotImplementedError("The model chosen can't return token embeddings") 38 | 39 | def __str__(self): 40 | return f"Sbert {self.reference}" 41 | 42 | def __repr__(self): 43 | return f"Sbert(model_name_or_file_path={self.reference})" 44 | -------------------------------------------------------------------------------- /test/utils/test_context_managers.py: -------------------------------------------------------------------------------- 1 | import time 2 | from unittest import TestCase 3 | 4 | import tqdm 5 | 6 | from clayrs.utils.context_managers import get_progbar, get_iterator_parallel 7 | 8 | 9 | class TestContextManagers(TestCase): 10 | 11 | def test_get_progbar(self): 12 | 13 | with get_progbar(range(50), total=50) as pbar: 14 | self.assertIsInstance(pbar, tqdm.tqdm) 15 | 16 | expected_bar_format = "{desc} {percentage:.0f}%|{bar}| {n:}/{total_fmt} [{elapsed}<{remaining}]" 17 | result_bar_format = pbar.bar_format 18 | 19 | expected_list = list(range(50)) 20 | result_list = list(pbar) 21 | 22 | self.assertEqual(expected_bar_format, result_bar_format) 23 | self.assertEqual(expected_list, result_list) 24 | 25 | def test_get_iterator_parallel(self): 26 | 27 | def f(x): 28 | time.sleep(1) 29 | 30 | return x 31 | 32 | expected_list = list(range(5)) 33 | 34 | # single cpu 35 | with get_iterator_parallel(1, f, list(range(5))) as it: 36 | result_list = list(it) 37 | 38 | self.assertEqual(expected_list, result_list) 39 | 40 | # multi cpu 41 | with get_iterator_parallel(2, f, list(range(5))) as it: 42 | result_list = list(it) 43 | 44 | self.assertEqual(expected_list, result_list) 45 | 46 | # multi cpu with progbar 47 | with get_iterator_parallel(2, f, list(range(5)), progress_bar=True, total=5) as pbar: 48 | 49 | self.assertIsInstance(pbar, tqdm.tqdm) 50 | 51 | result_list = list(pbar) 52 | 53 | self.assertEqual(expected_list, result_list) 54 | --------------------------------------------------------------------------------
/test/test_files/users_70.dat: -------------------------------------------------------------------------------- 1 | 1::F::1::10::48067 2 | 2::M::56::16::70072 3 | 3::M::25::15::55117 4 | 4::M::45::7::02460 5 | 5::M::25::20::55455 6 | 6::F::50::9::55117 7 | 7::M::35::1::06810 8 | 8::M::25::12::11413 9 | 9::M::25::17::61614 10 | 10::F::35::1::95370 11 | 11::F::25::1::04093 12 | 12::M::25::12::32793 13 | 13::M::45::1::93304 14 | 14::M::35::0::60126 15 | 15::M::25::7::22903 16 | 16::F::35::0::20670 17 | 17::M::50::1::95350 18 | 18::F::18::3::95825 19 | 19::M::1::10::48073 20 | 20::M::25::14::55113 21 | 21::M::18::16::99353 22 | 22::M::18::15::53706 23 | 23::M::35::0::90049 24 | 24::F::25::7::10023 25 | 25::M::18::4::01609 26 | 26::M::25::7::23112 27 | 27::M::25::11::19130 28 | 28::F::25::1::14607 29 | 29::M::35::7::33407 30 | 30::F::35::7::19143 31 | 31::M::56::7::06840 32 | 32::F::25::0::19355 33 | 33::M::45::3::55421 34 | 34::F::18::0::02135 35 | 35::M::45::1::02482 36 | 36::M::25::3::94123 37 | 37::F::25::9::66212 38 | 38::F::18::4::02215 39 | 39::M::18::4::61820 40 | 40::M::45::0::10543 41 | 41::F::18::4::15116 42 | 42::M::25::8::24502 43 | 43::M::25::12::60614 44 | 44::M::45::17::98052 45 | 45::F::45::16::94110 46 | 46::M::18::19::75602 47 | 47::M::18::4::94305 48 | 48::M::25::4::92107 49 | 49::M::18::12::77084 50 | 50::F::25::2::98133 51 | 51::F::1::10::10562 52 | 52::M::18::4::72212 53 | 53::M::25::0::96931 54 | 54::M::50::1::56723 55 | 55::F::35::12::55303 56 | 56::M::35::20::60440 57 | 57::M::18::19::30350 58 | 58::M::25::2::30303 59 | 59::F::50::1::55413 60 | 60::M::50::1::72118 61 | 61::M::25::17::95122 62 | 62::F::35::3::98105 63 | 63::M::18::4::54902 64 | 64::M::18::1::53706 65 | 65::M::35::12::55803 66 | 66::M::25::18::57706 67 | 67::F::50::5::60181 68 | 68::M::18::4::53706 69 | 69::F::25::1::02143 70 | 70::M::18::4::53703 71 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from setuptools import setup 3 | 4 | with open('requirements.txt') as f: 5 | requirements = f.read().splitlines() 6 | 7 | with open("README.md", "r") as fh: 8 | long_description = fh.read() 9 | 10 | VERSION = "0.5.1" 11 | 12 | setup(name='clayrs', 13 | version=VERSION, 14 | license='GPL-3.0', 15 | author='Antonio Silletti, Elio Musacchio, Roberta Sallustio', 16 | install_requires=requirements, 17 | description='Complexly represent contents, build recommender systems, evaluate them. 
All in one place!', 18 | long_description=long_description, 19 | long_description_content_type="text/markdown", 20 | keywords=['recommender system', 'cbrs', 'evaluation', 'recsys'], 21 | url='https://github.com/swapUniba/ClayRS', 22 | include_package_data=True, 23 | packages=setuptools.find_packages(), 24 | python_requires='>=3.8', 25 | 26 | classifiers=[ 27 | 'Development Status :: 3 - Alpha', 28 | 'Intended Audience :: Developers', 29 | 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', 30 | 'Operating System :: OS Independent', 31 | 'Programming Language :: Python', 32 | 'Programming Language :: Python :: 3', 33 | 'Programming Language :: Python :: 3 :: Only', 34 | 'Programming Language :: Python :: 3.8', 35 | 'Programming Language :: Python :: 3.9', 36 | 'Programming Language :: Python :: 3.10', 37 | 'Topic :: Software Development :: Libraries', 38 | 'Topic :: Software Development :: Libraries :: Python Modules', 39 | 'Topic :: Software Development :: Testing :: Unit' 40 | ] 41 | 42 | ) 43 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/combining_embeddings.md: -------------------------------------------------------------------------------- 1 | # Combine Embeddings 2 | 3 | Via the following, you can obtain embeddings of *coarser* granularity from models which return 4 | embeddings of *finer* granularity (e.g. obtain sentence embeddings from a model which returns word embeddings) 5 | 6 | ```python 7 | from clayrs import content_analyzer as ca 8 | 9 | # obtain sentence embeddings combining token embeddings with a 10 | # centroid technique 11 | ca.Word2SentenceEmbedding(embedding_source=ca.Gensim('glove-twitter-50'), 12 | combining_technique=ca.Centroid()) 13 | ``` 14 | 15 | ::: clayrs.content_analyzer.Word2SentenceEmbedding 16 | handler: python 17 | options: 18 | show_root_toc_entry: true 19 | show_root_heading: true 20 | 21 | ::: clayrs.content_analyzer.Word2DocEmbedding 22 | handler: python 23 | options: 24 | show_root_toc_entry: true 25 | show_root_heading: true 26 | 27 | ::: clayrs.content_analyzer.Sentence2DocEmbedding 28 | handler: python 29 | options: 30 | show_root_toc_entry: true 31 | show_root_heading: true 32 | 33 | ## Combining Techniques 34 | 35 | ::: clayrs.content_analyzer.Centroid 36 | handler: python 37 | options: 38 | heading_level: 3 39 | show_root_toc_entry: true 40 | show_root_heading: true 41 | 42 | ::: clayrs.content_analyzer.Sum 43 | handler: python 44 | options: 45 | heading_level: 3 46 | show_root_toc_entry: true 47 | show_root_heading: true 48 | 49 | ::: clayrs.content_analyzer.SingleToken 50 | handler: python 51 | options: 52 | heading_level: 3 53 | show_root_toc_entry: true 54 | show_root_heading: true -------------------------------------------------------------------------------- /test/content_analyzer/ratings_manager/test_rating_processor.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from clayrs.content_analyzer.ratings_manager.score_processor import NumberNormalizer 3 | 4 | 5 | class TestNumberNormalizer(TestCase): 6 | def test_fit(self): 7 | scores = [1, 2, 5, 5, 3, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 10] 8 | 9 | result = [] 10 | for score in scores: 11 | converted = NumberNormalizer(scale=(1, 10)).fit(score) 12 | result.append(converted) 13 | 14 | expected = [-1.0, -0.77777777, -0.11111111, -0.11111111, 15 | -0.55555555, -0.44444444, 
-0.42222222, -0.39999999, 16 | -0.37777777, -0.35555555, -0.33333333, 1.0] 17 | 18 | for expected_score, result_score in zip(expected, result): 19 | self.assertAlmostEqual(expected_score, result_score) 20 | 21 | # Test with rounding at the fourth digit 22 | result_rounded = [] 23 | for score in scores: 24 | converted_rounded = NumberNormalizer(scale=(1, 10), decimal_rounding=4).fit(score) 25 | result_rounded.append(converted_rounded) 26 | 27 | expected_rounded = [-1.0, -0.7778, -0.1111, -0.1111, -0.5556, 28 | -0.4444, -0.4222, -0.4, -0.3778, -0.3556, 29 | -0.3333, 1.0] 30 | 31 | for expected_score_rounded, result_score_rounded in zip(expected_rounded, result_rounded): 32 | self.assertAlmostEqual(expected_score_rounded, result_score_rounded) 33 | 34 | def test_error(self): 35 | 36 | # 2 numbers must be passed 37 | with self.assertRaises(ValueError): 38 | NumberNormalizer(scale=(1,)) 39 | 40 | with self.assertRaises(ValueError): 41 | NumberNormalizer(scale=(1, 2, 3)) 42 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/first_steps/colab_examples.md: -------------------------------------------------------------------------------- 1 | # Colab examples 2 | 3 | The GitHub repository hosts some IPython notebooks to get you up and running with the framework! 4 | 5 | To run them you can use [Google Colab](https://colab.research.google.com/?hl=it): 6 | 7 | * Go to Colab and open `File > Open notebook` 8 |
9 | ![Image title](../img/colab_examples_1.png){ width="600" } 10 |
11 | 12 | * Then go to `GitHub` section, write ***swapUniba/ClayRS*** in the first text box and choose the example you want 13 | to run! 14 |
15 | ![Image title](../img/colab_examples_2.png){ width="600" } 16 |
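One practical note before running any of the notebooks: a fresh Colab runtime does not come with *ClayRS* preinstalled, so if the notebook you open doesn't already include an install cell, a first cell like the following sets up the environment (a minimal sketch, with no specific version pinned):

```python
# first Colab cell: install ClayRS into the runtime
# (skip this if the chosen notebook already does it)
!pip install clayrs
```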
17 | 18 | ## Available examples 19 | 20 | All the following use the ***Movielens 100k*** dataset 21 | 22 | * `1_tfidf_centroid.ipynb`: the easiest example, a good starting point for newcomers to the framework. 23 | It guides you in how to represent a field of the *raw source* via the *TfIdf* technique, how to instantiate a 24 | `CentroidVector` algorithm and how to evaluate recommendations generated with several state-of-the-art metrics; 25 | 26 | * `2_embeddings_randomforest.ipynb`: a slightly more complex example, where *several fields* are represented 27 | with *several techniques*, including ***embedding techniques***. For the recommendation phase a 28 | `Random Forest` classifier is used; 29 | 30 | * `3_graph_pagerank.ipynb`: it will guide you on how to perform *graph based recommendation* via `ClayRS` 31 | (how to instantiate a graph, how to manipulate it, how to load exogenous properties). The *Personalized PageRank* 32 | algorithm is used in the recsys phase; 33 | 34 | * `4_evaluate_other_recs.ipynb`: a *versatile* example which shows how to export results (and intermediate results) 35 | obtained by `ClayRS`, but also how to evaluate ***external*** recommendation lists (i.e. recommendations generated via 36 | other tools) -------------------------------------------------------------------------------- /clayrs/evaluation/metrics/metrics.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from abc import ABC, abstractmethod 3 | from functools import wraps 4 | from typing import TYPE_CHECKING 5 | 6 | import numpy as np 7 | 8 | if TYPE_CHECKING: 9 | from clayrs.recsys.partitioning import Split 10 | 11 | 12 | class Metric(ABC): 13 | """ 14 | Abstract class that generalizes the metric concept 15 | 16 | Every metric may need a different kind of "prediction": some (e.g. NDCG, MRR, etc.) may need recommendation lists in 17 | which the recsys ranks every unseen item, some (e.g. MAE, RMSE, etc.) may need a score prediction where the recsys 18 | must predict the rating that a user would give to an unseen item.
19 | So a Metric category (subclass of this class) must implement the "eval_fit_recsys(...)" specifying its needs, 20 | while every single metric (subclasses of the metric category class) must implement the "perform(...)" method 21 | specifying how to execute the metric computation 22 | """ 23 | 24 | @abstractmethod 25 | def __str__(self): 26 | raise NotImplementedError 27 | 28 | @abstractmethod 29 | def perform(self, split: Split): 30 | raise NotImplementedError 31 | 32 | 33 | def handler_different_users(func): 34 | """ 35 | Handler that covers the case in which there are different users between the predictions and the truth of a split: in 36 | that case a ValueError exception is raised 37 | """ 38 | @wraps(func) 39 | def inner_function(self, split, *args, **kwargs): 40 | 41 | if not np.array_equal(np.sort(split.pred.unique_user_id_column.flat), 42 | np.sort(split.truth.unique_user_id_column.flat)): 43 | raise ValueError("Predictions and truths must contain the same users!") 44 | 45 | return func(self, split, *args, **kwargs) 46 | 47 | return inner_function 48 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_loader/test_gensim_loader.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | from unittest import mock 3 | from unittest.mock import patch, Mock, MagicMock 4 | import numpy as np 5 | 6 | from test.content_analyzer.embeddings.test_embedding_source import TestEmbeddingSource 7 | from clayrs.content_analyzer.embeddings.embedding_loader.gensim import Gensim 8 | 9 | result_matrix = { 10 | 'title': np.array([random() for _ in range(25)]), 11 | 'plot': np.array([random() for _ in range(25)]) 12 | } 13 | 14 | 15 | def get_item(key): 16 | return result_matrix[key] 17 | 18 | 19 | def similar_by_vector(vector, n_to_find): 20 | for i, vec in enumerate(result_matrix.values()): 21 | if np.array_equal(vec, vector): 22 | return [(list(result_matrix.keys())[i], 1)] 23 | 24 | 25 | mocked_model = MagicMock() 26 | mocked_model.__getitem__.side_effect = get_item 27 | #mocked_model = MagicMock() 28 | mocked_model.similar_by_vector.side_effect = similar_by_vector 29 | mocked_model.vector_size = 25 30 | 31 | 32 | class TestGensimDownloader(TestEmbeddingSource): 33 | 34 | def test_load(self): 35 | 36 | with mock.patch('gensim.downloader.info', return_value={'models': 'glove-twitter-25'}): 37 | with mock.patch('gensim.downloader.load', return_value=mocked_model): 38 | source = Gensim('glove-twitter-25') 39 | 40 | # result is a matrix containing 2 rows, one for 'title', one for 'plot' 41 | result = source.load(["title", "plot"]) 42 | 43 | # the expected shape of result is (2, 25): 44 | # 2 for words and 25 due to the model 'glove-twitter-25' 45 | expected_shape = (2, 25) 46 | self.assertEqual(expected_shape, result.shape) 47 | 48 | self.assertWordEmbeddingMatches(source, result[0], "title") 49 | self.assertWordEmbeddingMatches(source, result[1], "plot") 50 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/word2vec.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from gensim.models import Word2Vec 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner 6 | 7 | 8 | class GensimWord2Vec(GensimWordEmbeddingLearner): 9 | """ 10 | Class that implements 
Word2Vec model thanks to the Gensim library. 11 | 12 | If a pre-trained local Word2Vec model must be loaded, put its path in the `reference` parameter. 13 | Otherwise, a Word2Vec model will be trained from scratch based on the preprocessed corpus of the contents to complexly 14 | represent 15 | 16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 17 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to 18 | produce contents in the current run 19 | 20 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/word2vec.html) 21 | to see what else can be customized 22 | 23 | Args: 24 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the 25 | trained model won't be saved after training and will only be used to produce contents in the current run 26 | auto_save: If True, the model will be saved in the path specified in `reference` parameter 27 | """ 28 | 29 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs): 30 | super().__init__(reference, auto_save, ".kv", **kwargs) 31 | 32 | def fit_model(self, corpus: List): 33 | self.model = Word2Vec(sentences=corpus, **self.additional_parameters).wv 34 | 35 | def __str__(self): 36 | return "GensimWord2Vec" 37 | 38 | def __repr__(self): 39 | return f'GensimWord2Vec(attributes={str(self.model)})' 40 | -------------------------------------------------------------------------------- /clayrs/utils/load_content.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | import lzma 3 | import os 4 | import pickle 5 | 6 | from clayrs.content_analyzer.content_representation.representation_container import RepresentationContainer 7 | from clayrs.content_analyzer.content_representation.content import Content 8 | 9 | 10 | def load_content_instance(directory: str, content_id: str, only_field_representations: dict = None) -> Content: 11 | """ 12 | Loads a serialized content 13 | Args: 14 | directory: Path to the directory in which the content is stored 15 | content_id: ID of the content to load (its filename) 16 | only_field_representations: Specify exactly which representation to load for the content 17 | (e.g. {'Plot': 0, 'Genres': 1}). 
Useful for alleviating memory load 18 | 19 | Returns: 20 | content (Content) 21 | """ 22 | try: 23 | content_filename = os.path.join(directory, '{}.xz'.format(content_id)) 24 | with lzma.open(content_filename, "rb") as content_file: 25 | content = pickle.load(content_file) 26 | 27 | if only_field_representations is not None: 28 | smaller_content = Content(content_id) 29 | field_dict_smaller = {} 30 | for field, repr_id_list in only_field_representations.items(): 31 | field_dict_smaller[field] = [content.get_field_representation(field, repr_id) 32 | for repr_id in repr_id_list] 33 | 34 | for field, repr_list in field_dict_smaller.items(): 35 | ext_id_list = [id if isinstance(id, str) else None for id in only_field_representations[field]] 36 | field_repr_container = RepresentationContainer(repr_list, ext_id_list) 37 | smaller_content.append_field(field, field_repr_container) 38 | 39 | content = smaller_content 40 | 41 | except FileNotFoundError: 42 | content = None 43 | 44 | return content 45 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/fasttext.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from gensim.models.fasttext import FastText 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner 6 | 7 | 8 | class GensimFastText(GensimWordEmbeddingLearner): 9 | """ 10 | Class that implements FastText model thanks to the Gensim library. 11 | 12 | If a pre-trained local Word2Vec model must be loaded, put its path in the `reference` parameter. 13 | Otherwise, a Word2Vec model will be trained from scratch based on the preprocessed corpus of the contents to complexly 14 | represent 15 | 16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 17 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to 18 | produce contents in the current run 19 | 20 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/fasttext.html) 21 | to see what else can be customized 22 | 23 | Args: 24 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. 
-------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/fasttext.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from gensim.models.fasttext import FastText 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner 6 | 7 | 8 | class GensimFastText(GensimWordEmbeddingLearner): 9 | """ 10 | Class that implements FastText model thanks to the Gensim library. 11 | 12 | If a pre-trained local FastText model must be loaded, put its path in the `reference` parameter. 13 | Otherwise, a FastText model will be trained from scratch based on the preprocessed corpus of the contents to complexly 14 | represent 15 | 16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 17 | `auto_save=True`. If `reference` is None, the trained model won't be saved after training and will only be used to 18 | produce contents in the current run 19 | 20 | Additional parameters regarding the model itself can be passed; check the [gensim documentation](https://radimrehurek.com/gensim/models/fasttext.html) 21 | to see what else can be customized 22 | 23 | Args: 24 | reference: Path of the model to load/where the trained model will be saved if `auto_save=True`. If None, the 25 | trained model won't be saved after training and will only be used to produce contents in the current run 26 | auto_save: If True, the model will be saved in the path specified in the `reference` parameter 27 | """ 28 | 29 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs): 30 | super().__init__(reference, auto_save, ".kv", **kwargs) 31 | 32 | def fit_model(self, corpus: List): 33 | self.model = FastText(sentences=corpus, **self.additional_parameters).wv 34 | 35 | def __str__(self): 36 | return "FastText" 37 | 38 | def __repr__(self): 39 | return f"FastText(reference={self.reference}, auto_save={self._auto_save}, " \ 40 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})" 41 | -------------------------------------------------------------------------------- /test/recsys/content_based_algorithm/test_contents_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | from os import listdir 4 | from os.path import splitext, isfile, join 5 | 6 | from clayrs.content_analyzer import SearchIndex 7 | from clayrs.recsys.content_based_algorithm.contents_loader import LoadedContentsDict, LoadedContentsIndex 8 | from test import dir_test_files 9 | 10 | 11 | class TestLoadedContentsDict(unittest.TestCase): 12 | def test_all(self): 13 | # test load_available_contents for content based algorithm 14 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/') 15 | 16 | interface_dict = LoadedContentsDict(movies_dir) 17 | 18 | # we are testing get_contents_interface 19 | self.assertIsInstance(interface_dict.get_contents_interface(), dict) 20 | 21 | # since we didn't specify which items to load, we expect all items from the folder to have been loaded 22 | expected = {splitext(filename)[0] 23 | for filename in listdir(movies_dir) 24 | if isfile(join(movies_dir, filename)) and splitext(filename)[1] == ".xz"} 25 | 26 | # we are also testing iter 27 | result = set(interface_dict) 28 | self.assertEqual(expected, result) 29 | 30 | # test loaded contents specified 31 | interface_dict = LoadedContentsDict(movies_dir, {'tt0112281', 'tt0112302'}) 32 | 33 | # we are testing len 34 | self.assertTrue(len(interface_dict) == 2) 35 | 36 | # we are testing getitem 37 | self.assertIsNotNone(interface_dict['tt0112281']) 38 | self.assertIsNotNone(interface_dict['tt0112302']) 39 | 40 | # we are testing get 41 | self.assertIsNotNone(interface_dict.get('tt0112281')) 42 | self.assertIsNone(interface_dict.get('should be None')) 43 | 44 | 45 | class TestLoadedContentsIndex(unittest.TestCase): 46 | def test_all(self): 47 | index = "../test/test_files/index" 48 | 49 | self.assertIsInstance(LoadedContentsIndex(index).get_contents_interface(), SearchIndex) 50 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/doc2vec.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument 4 | 5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner 6 | 7 | 8 | class GensimDoc2Vec(GensimWordEmbeddingLearner): 9 | """ 10 | Class that implements Doc2Vec model thanks to the Gensim library. 11 | 12 | If a pre-trained local Doc2Vec model must be loaded, put its path in the `reference` parameter.
13 | Otherwise, a Doc2Vec model will be trained from scratch based on the preprocessed corpus of the contents to complexly 14 | represent 15 | 16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 17 | `auto_save=True`. If `reference` is None, the trained model won't be saved after training and will only be used to 18 | produce contents in the current run 19 | 20 | Additional parameters regarding the model itself can be passed; check the [gensim documentation](https://radimrehurek.com/gensim/models/doc2vec.html) 21 | to see what else can be customized 22 | 23 | Args: 24 | reference: Path of the model to load/where the trained model will be saved if `auto_save=True`. If None, the 25 | trained model won't be saved after training and will only be used to produce contents in the current run 26 | auto_save: If True, the model will be saved in the path specified in the `reference` parameter 27 | """ 28 | 29 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs): 30 | super().__init__(reference, auto_save, ".kv", **kwargs) 31 | 32 | def fit_model(self, corpus: List): 33 | tagged_data = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)] 34 | self.model = Doc2Vec(tagged_data, **self.additional_parameters).wv 35 | 36 | def __str__(self): 37 | return "GensimDoc2Vec" 38 | 39 | def __repr__(self): 40 | return f"GensimDoc2Vec(reference={self.reference}, auto_save={self._auto_save}, " \ 41 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})" 42 | -------------------------------------------------------------------------------- /test/content_analyzer/field_content_production_techniques/embedding_technique/test_combining_technique.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import numpy as np 3 | 4 | from clayrs.content_analyzer.field_content_production_techniques.embedding_technique.combining_technique import \ 5 | Centroid, Sum, SingleToken 6 | 7 | 8 | class TestCentroid(TestCase): 9 | def test_combine(self): 10 | z = np.ndarray(shape=(3, 3)) 11 | 12 | z[0, :] = [1, 1, 1] 13 | z[1, :] = [2, 2, 2] 14 | z[2, :] = [3, 3, 3] 15 | 16 | combiner = Centroid() 17 | result = combiner.combine(z) 18 | 19 | expected = np.ndarray(shape=(3, )) 20 | expected[:] = [2, 2, 2] 21 | 22 | self.assertTrue((result == expected).all()) 23 | 24 | 25 | class TestSum(TestCase): 26 | def test_combine(self): 27 | z = np.ndarray(shape=(3, 3)) 28 | 29 | z[0, :] = [1, 9, 1] 30 | z[1, :] = [7, 2, 4] 31 | z[2, :] = [3, 5, 3] 32 | 33 | combiner = Sum() 34 | result = combiner.combine(z) 35 | 36 | expected = np.ndarray(shape=(3, )) 37 | expected[:] = [11, 16, 8] 38 | 39 | self.assertTrue((result == expected).all()) 40 | 41 | 42 | class TestSingleToken(TestCase): 43 | def test_combine(self): 44 | z = np.ndarray(shape=(3, 3)) 45 | 46 | z[0, :] = [1, 9, 1] 47 | z[1, :] = [7, 2, 4] 48 | z[2, :] = [3, 5, 3] 49 | 50 | combiner = SingleToken(0) 51 | result = combiner.combine(z) 52 | 53 | expected = np.ndarray(shape=(3, )) 54 | expected[:] = [1, 9, 1] 55 | 56 | self.assertTrue((result == expected).all()) 57 | 58 | combiner = SingleToken(2) 59 | result = combiner.combine(z) 60 | 61 | expected = np.ndarray(shape=(3, )) 62 | expected[:] = [3, 5, 3] 63 | 64 | self.assertTrue((result == expected).all()) 65 | 66 | def test_raise(self): 67 | z = np.ndarray(shape=(3, 3)) 68 | 69 | z[0, :] = [1, 9, 1] 70 | z[1, :] = [7, 2, 4] 71 | z[2, :] = [3, 5, 3] 72 | 73 | with
self.assertRaises(IndexError): 74 | SingleToken(99).combine(z) 75 | -------------------------------------------------------------------------------- /docs/mkdocs/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | - toc 5 | --- 6 | 7 | !!! warning 8 | 9 | Docs are complete, but revision is still a Work in Progress. Sorry for any typos! 10 | 11 |

12 | *(ClayRS logo image)* 13 |

14 | 15 | # Welcome to ClayRS's documentation! 16 | 17 | [![Build Status](https://github.com/swapUniba/ClayRS/actions/workflows/testing_pipeline.yml/badge.svg)](https://github.com/swapUniba/ClayRS/actions/workflows/testing_pipeline.yml) &nbsp; 18 | [![Docs](https://github.com/swapUniba/ClayRS/actions/workflows/docs_building.yml/badge.svg)](https://swapuniba.github.io/ClayRS/) &nbsp; 19 | [![codecov](https://codecov.io/gh/swapUniba/ClayRS/branch/master/graph/badge.svg?token=dftmT3QD8D)](https://codecov.io/gh/swapUniba/ClayRS) &nbsp; 20 | [![Python versions](https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10-blue)](https://www.python.org/downloads/) 21 | 22 | ***ClayRS*** is a Python framework for (mainly) content-based recommender systems which allows you to perform several operations, starting from a raw representation of users and items up to building and evaluating a recommender system. It also supports graph-based recommendation with feature selection algorithms and graph manipulation methods. 23 | 24 | The framework has three main modules, which you can also use individually: 25 | 26 |

27 | *(ClayRS framework image)* 28 |

29 | 30 | Given a raw source, the ***Content Analyzer***: 31 | 32 | * Creates and serializes contents, 33 | * Using the chosen configuration 34 | 35 | The ***RecSys*** module allows you to: 36 | 37 | * Instantiate a recommender system 38 | * *Using items and users serialized by the Content Analyzer* 39 | * Make score *prediction* or *recommend* items for the active user(s) 40 | 41 | The ***EvalModel*** has the task of evaluating a recommender system, using several state-of-the-art metrics 42 | 43 | The various sections of this documentation will guide you in becoming a full expert of **ClayRS**! 44 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_loader/vector_strategy.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import numpy as np 4 | import torch 5 | 6 | 7 | class VectorStrategy(ABC): 8 | """ 9 | Abstract class which generalizes the strategies that combine multiple output layers of a model 10 | into a single final representation 11 | """ 12 | def __init__(self, last_interesting_layers: int): 13 | self.last_interesting_layers = last_interesting_layers 14 | 15 | @abstractmethod 16 | def build_embedding(self, token_embeddings) -> np.ndarray: 17 | raise NotImplementedError 18 | 19 | 20 | class SumStrategy(VectorStrategy): 21 | """ 22 | Class which sums the `last_interesting_layers` of the output obtained by the Transformer model 23 | 24 | Args: 25 | last_interesting_layers: Number of last layers to sum in order to summarize information 26 | """ 27 | def __init__(self, last_interesting_layers: int): 28 | super().__init__(last_interesting_layers) 29 | 30 | def build_embedding(self, token_embeddings: torch.Tensor) -> np.ndarray: 31 | token_vecs_sum = [] 32 | for token in token_embeddings: 33 | sum_vec = torch.sum(token[-self.last_interesting_layers:], dim=0) 34 | token_vecs_sum.append(sum_vec) 35 | return torch.stack(token_vecs_sum).numpy() 36 | 37 | def __str__(self): 38 | return "SumStrategy" 39 | 40 | def __repr__(self): 41 | return f"SumStrategy(last_interesting_layers={self.last_interesting_layers})" 42 | 43 | 44 | class CatStrategy(VectorStrategy): 45 | """ 46 | Class which concatenates the `last_interesting_layers` of the output obtained by the Transformer model 47 | 48 | Args: 49 | last_interesting_layers: Number of last layers to concatenate in order to summarize information 50 | """ 51 | def __init__(self, last_interesting_layers: int): 52 | super().__init__(last_interesting_layers) 53 | 54 | def build_embedding(self, token_embeddings: torch.Tensor) -> np.ndarray: 55 | token_vecs_cat = [] 56 | for token in token_embeddings: 57 | cat_vec = token[-1] 58 | for i in range(-2, -self.last_interesting_layers - 1, -1): 59 | cat_vec = torch.cat((cat_vec, token[i]), dim=0) 60 | token_vecs_cat.append(cat_vec) 61 | return torch.stack(token_vecs_cat).numpy() 62 | 63 | def __str__(self): 64 | return "CatStrategy" 65 | 66 | def __repr__(self): 67 | return f"CatStrategy(last_interesting_layers={self.last_interesting_layers})" 68 |
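
A small demo of the two strategies (illustrative, not repository code; the random tensor stands in for per-token Transformer hidden states shaped [tokens, layers, hidden_size]):

import torch

from clayrs.content_analyzer.embeddings.embedding_loader.vector_strategy import CatStrategy, SumStrategy

# 4 tokens, 12 layers, 768-dimensional hidden states
token_embeddings = torch.rand(4, 12, 768)

summed = SumStrategy(last_interesting_layers=4).build_embedding(token_embeddings)
concat = CatStrategy(last_interesting_layers=4).build_embedding(token_embeddings)

print(summed.shape)  # (4, 768): element-wise sum of the last 4 layers
print(concat.shape)  # (4, 3072): the last 4 layers concatenated per token
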
-------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_learner/test_random_indexing.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import os 4 | 5 | import gensim 6 | import numpy as np 7 | from gensim.corpora import Dictionary 8 | from gensim.models import RpModel 9 | from gensim.test.utils import common_texts 10 | from clayrs.content_analyzer.embeddings.embedding_learner.random_indexing import GensimRandomIndexing 11 | 12 | num_topics = 10 13 | model_path = 'test_model_ri' 14 | 15 | 16 | class TestRandomIndexing(TestCase): 17 | def test_all(self): 18 | my_learner = GensimRandomIndexing(model_path, num_topics=num_topics) 19 | 20 | corpus = common_texts 21 | my_learner.fit_model(corpus) 22 | 23 | # check that vector size is correct 24 | self.assertEqual(num_topics, my_learner.get_vector_size()) 25 | 26 | common_dictionary = Dictionary(common_texts) 27 | common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] 28 | expected_learner = RpModel(common_corpus, num_topics=num_topics) 29 | 30 | # test get_embedding not existent document 31 | unseen_doc_text = ['this', 'is', 'a', 'new', 'document', 'which', 'doesnt', 'exist'] 32 | 33 | # check that the doc is unseen (embedding has len 0) 34 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text) 35 | expected = expected_learner[unseen_doc] 36 | self.assertTrue(len(expected) == 0) 37 | 38 | # in our framework if the doc is unseen KeyError is raised 39 | with self.assertRaises(KeyError): 40 | my_learner.get_embedding(unseen_doc_text) 41 | 42 | # test get_embedding existent document 43 | unseen_doc_text = ['human', 'time', 'trees'] 44 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text) 45 | expected = expected_learner[unseen_doc] 46 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics) 47 | result_vector = my_learner.get_embedding(unseen_doc_text) 48 | 49 | # we don't have a way to check if the 2 vectors are the same, because they are built at random. 50 | # We just check that they are of the same length 51 | 52 | self.assertEqual(len(expected_vector), len(result_vector)) 53 | 54 | # test save 55 | my_learner.save() 56 | self.assertTrue(os.path.isfile(f"{model_path}.model")) 57 | 58 | # test that after load we obtain a valid embedding 59 | my_learner_loaded = GensimRandomIndexing(model_path) 60 | my_learner_loaded.load_model() 61 | unseen_doc_text = ['human', 'time', 'trees'] 62 | result_vector = my_learner_loaded.get_embedding(unseen_doc_text) 63 | 64 | self.assertTrue(np.any(result_vector)) 65 | -------------------------------------------------------------------------------- /test/test_files/complex_contents/create_complex_contents.py: -------------------------------------------------------------------------------- 1 | import clayrs.content_analyzer as ca 2 | 3 | items_json = "../movies_info_reduced.json" 4 | users_dat = "../users_70.dat" 5 | 6 | 7 | def item_fit(): 8 | config = ca.ItemAnalyzerConfig( 9 | ca.JSONFile(items_json), 10 | id='imdbID', 11 | output_directory='movies_codified/', 12 | export_json=True 13 | ) 14 | 15 | config.add_multiple_config( 16 | 'Plot', 17 | [ 18 | ca.FieldConfig(ca.SkLearnTfIdf(), 19 | ca.NLTK(stopwords_removal=True), id='tfidf'), 20 | ca.FieldConfig(ca.SentenceEmbeddingTechnique(ca.Sbert('paraphrase-distilroberta-base-v1')), 21 | ca.NLTK(stopwords_removal=True), id='embedding'), 22 | ca.FieldConfig(ca.OriginalData(), id='index_original', memory_interface=ca.SearchIndex('index')), 23 | ca.FieldConfig(ca.OriginalData(), ca.NLTK(stopwords_removal=True), 24 | id='index_preprocessed', memory_interface=ca.SearchIndex('index')), 25 | ] 26 | ) 27 | 28 | config.add_multiple_config( 29 | 'Genre', 30 | [ 31 | ca.FieldConfig(ca.WordEmbeddingTechnique(ca.Gensim('glove-twitter-25')), 32 | ca.NLTK(stemming=True), id='embedding'), 33 |
ca.FieldConfig(ca.WhooshTfIdf(), 34 | ca.NLTK(stemming=True), id='tfidf'), 35 | ca.FieldConfig(ca.OriginalData(), id='index_original', memory_interface=ca.SearchIndex('index')), 36 | ca.FieldConfig(ca.OriginalData(), ca.NLTK(stopwords_removal=True), 37 | memory_interface=ca.SearchIndex('index')), 38 | ] 39 | ) 40 | 41 | config.add_multiple_config( 42 | 'Year', 43 | [ 44 | ca.FieldConfig(ca.OriginalData(), id='default_string'), 45 | ca.FieldConfig(ca.OriginalData(dtype=int), id='int') 46 | ] 47 | ) 48 | 49 | config.add_single_config( 50 | 'imdbRating', 51 | ca.FieldConfig(ca.OriginalData(dtype=float)) 52 | ) 53 | 54 | config.add_single_exogenous( 55 | ca.ExogenousConfig(ca.DBPediaMappingTechnique("dbo:Film", "Title"), id='dbpedia') 56 | ) 57 | 58 | ca.ContentAnalyzer(config).fit() 59 | 60 | 61 | def users_fit(): 62 | config = ca.UserAnalyzerConfig( 63 | ca.DATFile(users_dat), 64 | id='0', 65 | output_directory='users_codified', 66 | export_json=True 67 | ) 68 | 69 | config.add_single_exogenous( 70 | ca.ExogenousConfig(ca.PropertiesFromDataset(), id='local') 71 | ) 72 | 73 | ca.ContentAnalyzer(config).fit() 74 | 75 | 76 | if __name__ == "__main__": 77 | item_fit() 78 | users_fit() 79 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_learner/test_lda.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import os 4 | 5 | import numpy as np 6 | from gensim.corpora import Dictionary 7 | from gensim.models import LdaModel 8 | from gensim.test.utils import common_texts 9 | import gensim 10 | 11 | from clayrs.content_analyzer.embeddings.embedding_learner.lda import GensimLDA 12 | 13 | # we fix random_state for reproducibility 14 | random_state = 42 15 | num_topics = 100 16 | model_path = 'test_model_lda' 17 | 18 | 19 | class TestLda(TestCase): 20 | def test_all(self): 21 | my_learner = GensimLDA(model_path, num_topics=num_topics, random_state=random_state) 22 | 23 | corpus = common_texts 24 | my_learner.fit_model(corpus) 25 | 26 | # check that vector size is correct 27 | self.assertEqual(num_topics, my_learner.get_vector_size()) 28 | 29 | common_dictionary = Dictionary(common_texts) 30 | common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] 31 | expected_learner = LdaModel(common_corpus, num_topics=num_topics, random_state=random_state) 32 | 33 | # test get_embedding not existent document 34 | unseen_doc_text = ['this', 'is', 'a', 'new', 'document', 'which', 'doesnt', 'exist'] 35 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text) 36 | expected = expected_learner[unseen_doc] 37 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics) 38 | 39 | result_vector = my_learner.get_embedding(unseen_doc_text) 40 | 41 | self.assertTrue(np.array_equal(expected_vector, result_vector)) 42 | 43 | # test get_embedding existent document 44 | unseen_doc_text = ['human', 'time', 'trees'] 45 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text) 46 | expected = expected_learner[unseen_doc] 47 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics) 48 | 49 | result_vector = my_learner.get_embedding(unseen_doc_text) 50 | 51 | self.assertTrue(np.array_equal(expected_vector, result_vector)) 52 | 53 | # test save 54 | my_learner.save() 55 | self.assertTrue(os.path.isfile(f"{model_path}.model")) 56 | self.assertTrue(os.path.isfile(f"{model_path}.model.expElogbeta.npy")) 57 | 
self.assertTrue(os.path.isfile(f"{model_path}.model.id2word")) 58 | self.assertTrue(os.path.isfile(f"{model_path}.model.state")) 59 | 60 | # test that after load we obtain a valid embedding 61 | my_learner_loaded = GensimLDA(model_path) 62 | my_learner_loaded.load_model() 63 | unseen_doc_text = ['human', 'time', 'trees'] 64 | result_vector = my_learner_loaded.get_embedding(unseen_doc_text) 65 | 66 | self.assertTrue(np.any(result_vector)) 67 | -------------------------------------------------------------------------------- /test/content_analyzer/embeddings/embedding_learner/test_latent_semantic_analysis.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import os 4 | import gensim 5 | 6 | import numpy as np 7 | from gensim.models import LsiModel 8 | 9 | from clayrs.content_analyzer.embeddings.embedding_learner.latent_semantic_analysis import GensimLatentSemanticAnalysis 10 | from gensim.corpora import Dictionary 11 | from gensim.test.utils import common_texts 12 | 13 | num_topics = 10 14 | model_path = 'test_model_lsa' 15 | 16 | 17 | class TestLatentSemanticAnalysis(TestCase): 18 | def test_all(self): 19 | my_learner = GensimLatentSemanticAnalysis(model_path, num_topics=num_topics) 20 | 21 | corpus = common_texts 22 | my_learner.fit_model(corpus) 23 | 24 | # check that vector size is correct 25 | self.assertEqual(num_topics, my_learner.get_vector_size()) 26 | 27 | common_dictionary = Dictionary(common_texts) 28 | common_corpus = [common_dictionary.doc2bow(text) for text in common_texts] 29 | expected_learner = LsiModel(common_corpus, num_topics=num_topics) 30 | 31 | # test get_embedding not existent document 32 | unseen_doc_text = ['this', 'is', 'a', 'new', 'document', 'which', 'doesnt', 'exist'] 33 | 34 | # check that the doc is unseen (embedding has len 0) 35 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text) 36 | expected = expected_learner[unseen_doc] 37 | self.assertTrue(len(expected) == 0) 38 | 39 | # in our framework if the doc is unseen KeyError is raised 40 | with self.assertRaises(KeyError): 41 | my_learner.get_embedding(unseen_doc_text) 42 | 43 | # test get_embedding existent document 44 | unseen_doc_text = ['human', 'time', 'trees'] 45 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text) 46 | expected = expected_learner[unseen_doc] 47 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics) 48 | result_vector = my_learner.get_embedding(unseen_doc_text) 49 | 50 | # we don't have a way to check if the 2 vectors are the same, because they are built at random.
51 | # We just check that they are of the same length 52 | 53 | self.assertEqual(len(expected_vector), len(result_vector)) 54 | 55 | # test save 56 | my_learner.save() 57 | self.assertTrue(os.path.isfile(f"{model_path}.model")) 58 | self.assertTrue(os.path.isfile(f"{model_path}.model.projection")) 59 | 60 | # test that after load we obtain a valid embedding 61 | my_learner_loaded = GensimLatentSemanticAnalysis(model_path) 62 | my_learner_loaded.load_model() 63 | unseen_doc_text = ['human', 'time', 'trees'] 64 | result_vector = my_learner_loaded.get_embedding(unseen_doc_text) 65 | 66 | self.assertTrue(np.any(result_vector)) 67 | 68 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/lda.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import gensim 4 | import numpy as np 5 | from gensim.corpora import Dictionary 6 | from gensim.models import LdaModel 7 | 8 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimDocumentEmbeddingLearner 9 | from clayrs.content_analyzer.utils.check_tokenization import check_tokenized 10 | 11 | 12 | class GensimLDA(GensimDocumentEmbeddingLearner): 13 | """ 14 | Class that implements Latent Dirichlet Allocation (LDA) thanks to the Gensim library. 15 | 16 | If a pre-trained local LDA model must be loaded, put its path in the `reference` parameter. 17 | Otherwise, an LDA model will be trained from scratch based on the preprocessed corpus of the contents to complexly 18 | represent 19 | 20 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 21 | `auto_save=True`. If `reference` is None, the trained model won't be saved after training and will only be used to 22 | produce contents in the current run 23 | 24 | Additional parameters regarding the model itself can be passed; check the [gensim documentation](https://radimrehurek.com/gensim/models/ldamodel.html) 25 | to see what else can be customized 26 | 27 | Args: 28 | reference: Path of the model to load/where the trained model will be saved if `auto_save=True`.
If None the 29 | trained model won't be saved after training and will only be used to produce contents in the current run 30 | auto_save: If True, the model will be saved in the path specified in `reference` parameter 31 | """ 32 | 33 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs): 34 | super().__init__(reference, auto_save, ".model", **kwargs) 35 | 36 | def fit_model(self, corpus: List): 37 | dictionary = Dictionary(corpus) 38 | word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus] 39 | self.model = LdaModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters) 40 | 41 | def load_model(self): 42 | return LdaModel.load(self.reference) 43 | 44 | def get_vector_size(self) -> int: 45 | return self.model.num_topics 46 | 47 | def get_embedding(self, document_tokenized: List[str]) -> np.ndarray: 48 | unseen_doc = self.model.id2word.doc2bow(check_tokenized(document_tokenized)) 49 | sparse_vector = self.model[unseen_doc] 50 | 51 | dense_vector: np.ndarray = gensim.matutils.sparse2full(sparse_vector, self.model.num_topics) 52 | return dense_vector 53 | 54 | def __str__(self): 55 | return "GensimLda" 56 | 57 | def __repr__(self): 58 | return f"GensimLda(reference={self.reference}, auto_save={self._auto_save}, " \ 59 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})" 60 | -------------------------------------------------------------------------------- /test/utils/test_automatic_methods.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import unittest 3 | 4 | from clayrs.utils.automatic_methods import autorepr 5 | 6 | 7 | class TestAutomaticMethods(unittest.TestCase): 8 | 9 | @classmethod 10 | def setUpClass(cls) -> None: 11 | 12 | # init method with only positional attributes 13 | class OnlyPositional: 14 | def __init__(self, attribute1, attribute2): 15 | self._repr_string = autorepr(self, inspect.currentframe()) 16 | 17 | def __repr__(self): 18 | return self._repr_string 19 | 20 | cls.only_pos_class = OnlyPositional('formal1', 'formal2') 21 | 22 | # init method with only args attributes 23 | class OnlyArgs: 24 | def __init__(self, *args): 25 | self._repr_string = autorepr(self, inspect.currentframe()) 26 | 27 | def __repr__(self): 28 | return self._repr_string 29 | 30 | cls.only_args_class = OnlyArgs('only_args1', 'only_args2') 31 | 32 | # init method with only kwargs attributes 33 | class OnlyKwargs: 34 | def __init__(self, **kwargs): 35 | self._repr_string = autorepr(self, inspect.currentframe()) 36 | 37 | def __repr__(self): 38 | return self._repr_string 39 | 40 | cls.only_kwargs_class = OnlyKwargs(kwargs1='only_kwargs1', kwargs2='only_kwargs2') 41 | 42 | # init method with all possible attributes 43 | class AllPossibleArgs: 44 | def __init__(self, attribute1, attribute2, *args, **kwargs): 45 | self._repr_string = autorepr(self, inspect.currentframe()) 46 | 47 | def __repr__(self): 48 | return self._repr_string 49 | 50 | cls.all_possible_args_class = AllPossibleArgs('formal1', 'formal2', 'args1', 'args2', 'args3', 51 | kwargs1='kwargs_val') 52 | 53 | def test_autorepr(self): 54 | 55 | expected = "OnlyPositional(attribute1='formal1', attribute2='formal2')" 56 | result = repr(self.only_pos_class) 57 | 58 | self.assertEqual(expected, result) 59 | 60 | expected = "OnlyArgs(*args='only_args1', *args='only_args2')" 61 | result = repr(self.only_args_class) 62 | 63 | self.assertEqual(expected, result) 64 | 65 | expected = 
"OnlyKwargs(*kwargs_kwargs1='only_kwargs1', *kwargs_kwargs2='only_kwargs2')" 66 | result = repr(self.only_kwargs_class) 67 | 68 | self.assertEqual(expected, result) 69 | 70 | expected = "AllPossibleArgs(attribute1='formal1', attribute2='formal2', *args='args1', " \ 71 | "*args='args2', *args='args3', *kwargs_kwargs1='kwargs_val')" 72 | result = repr(self.all_possible_args_class) 73 | 74 | self.assertEqual(expected, result) 75 | 76 | 77 | if __name__ == '__main__': 78 | unittest.main() 79 | -------------------------------------------------------------------------------- /test/recsys/content_based_algorithm/centroid_vector/test_similarities.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from clayrs.recsys.content_based_algorithm.centroid_vector.similarities import CosineSimilarity 3 | import numpy as np 4 | from sklearn.metrics.pairwise import cosine_similarity 5 | from scipy import sparse 6 | 7 | 8 | class TestCosineSimilarity(TestCase): 9 | def test_perform(self): 10 | sim = CosineSimilarity() 11 | 12 | # vector comparison 13 | a = np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]]) 14 | b = np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6]]) 15 | 16 | res = sim.perform(a, b) 17 | expected = cosine_similarity(a, b, dense_output=True) 18 | 19 | np.testing.assert_allclose(expected, res) 20 | 21 | # single vector vs one matrix comparison 22 | a = np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]]) 23 | b = np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6], 24 | [8, 5, 5, 6, 2, 3, 10, 2, 3, 4], 25 | [1, 2, 2, 4, 4, 7, 6, 5, 5, 3]]) 26 | 27 | res = sim.perform(a, b) 28 | expected = cosine_similarity(a, b, dense_output=True) 29 | 30 | # check that we compute a similarity for each pair 31 | self.assertTrue(res.shape[0] == 1 and res.shape[1] == 3) 32 | 33 | np.testing.assert_allclose(expected, res) 34 | 35 | # sparse comparison 36 | a = sparse.csr_matrix(np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]])) 37 | b = sparse.csr_matrix(np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6]])) 38 | 39 | res = sim.perform(a, b) 40 | expected = cosine_similarity(a, b, dense_output=True) 41 | 42 | np.testing.assert_allclose(expected, res) 43 | 44 | # single sparse vs sparse matrix comparison 45 | a = sparse.csr_matrix(np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]])) 46 | b = sparse.csr_matrix(np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6], 47 | [8, 5, 5, 6, 2, 3, 10, 2, 3, 4], 48 | [1, 2, 2, 4, 4, 7, 6, 5, 5, 3]])) 49 | 50 | res = sim.perform(a, b) 51 | expected = cosine_similarity(a, b, dense_output=True) 52 | 53 | # check that we compute a similarity for each pair 54 | self.assertTrue(res.shape[0] == 1 and res.shape[1] == 3) 55 | 56 | np.testing.assert_allclose(expected, res) 57 | 58 | # single vector vs sparse matrix comparison 59 | a = np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]]) 60 | b = sparse.csr_matrix(np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6], 61 | [8, 5, 5, 6, 2, 3, 10, 2, 3, 4], 62 | [1, 2, 2, 4, 4, 7, 6, 5, 5, 3]])) 63 | 64 | res = sim.perform(a, b) 65 | expected = cosine_similarity(a, b, dense_output=True) 66 | 67 | # check that we compute a similarity for each pair 68 | self.assertTrue(res.shape[0] == 1 and res.shape[1] == 3) 69 | 70 | np.testing.assert_allclose(expected, res) 71 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/ratings_manager/score_processor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Tuple 3 | import 
numpy as np 4 | 5 | 6 | class ScoreProcessor(ABC): 7 | """ 8 | Abstract class to process a rating score through its fit method 9 | """ 10 | def __init__(self, decimal_rounding: int = None): 11 | self.__decimal_rounding = decimal_rounding 12 | 13 | @property 14 | def decimal_rounding(self): 15 | return self.__decimal_rounding 16 | 17 | @abstractmethod 18 | def fit(self, score_data: object): 19 | raise NotImplementedError 20 | 21 | def __repr__(self): 22 | return f'ScoreProcessor(decimal rounding={self.__decimal_rounding})' 23 | 24 | 25 | class SentimentAnalysis(ScoreProcessor): 26 | """ 27 | Abstract class that generalizes the sentiment analysis technique 28 | """ 29 | 30 | @abstractmethod 31 | def fit(self, score_data: str): 32 | raise NotImplementedError 33 | 34 | @abstractmethod 35 | def __repr__(self): 36 | return f'SentimentAnalysis(decimal rounding={self.decimal_rounding})' 37 | 38 | 39 | class NumberNormalizer(ScoreProcessor): 40 | """ 41 | Class that normalizes numeric scores to a scale in the range $[-1.0, 1.0]$ 42 | 43 | Args: 44 | scale: Tuple where the first value is the minimum of the actual scale, second value is the maximum of the 45 | actual scale (e.g. `(1, 5)` represents an actual scale of scores from 1 (included) to 5 (included)) 46 | decimal_rounding: If set, the normalized score will be rounded to the chosen decimal digit 47 | """ 48 | def __init__(self, scale: Tuple[float, float], decimal_rounding: int = None): 49 | super().__init__(decimal_rounding) 50 | 51 | if len(scale) != 2: 52 | raise ValueError("The voting scale should be a tuple containing exactly two values, " 53 | "the minimum of the scale and the maximum!") 54 | 55 | self._old_min = scale[0] 56 | self._old_max = scale[1] 57 | 58 | def __str__(self): 59 | return "NumberNormalizer" 60 | 61 | def __repr__(self): 62 | return f'NumberNormalizer(scale=({self._old_min}, {self._old_max}), decimal rounding={self.decimal_rounding})' 63 | 64 | def fit(self, score_data: float) -> float: 65 | """ 66 | Method which will normalize the given score 67 | 68 | Args: 69 | score_data: score that will be normalized 70 | 71 | Returns: 72 | score normalized in the interval $[-1, 1]$ 73 | """ 74 | def convert_into_range(value: float, old_min: float, old_max: float, new_min: int = -1, new_max: int = 1): 75 | new_value = ((value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min 76 | if self.decimal_rounding: 77 | new_value = np.round(new_value, self.decimal_rounding) 78 | 79 | return new_value 80 | 81 | return convert_into_range(float(score_data), self._old_min, self._old_max) 82 |
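
A worked example of the conversion implemented by fit (an illustrative sketch, not repository code): with scale=(1, 5), a score of 4 maps to ((4 - 1) / (5 - 1)) * (1 - (-1)) + (-1) = 0.75 * 2 - 1 = 0.5.

from clayrs.content_analyzer.ratings_manager.score_processor import NumberNormalizer

# ratings expressed on a 1-5 scale, normalized into [-1, 1]
normalizer = NumberNormalizer(scale=(1, 5), decimal_rounding=2)

print(normalizer.fit(4))  # 0.5
print(normalizer.fit(1))  # -1.0 (minimum of the old scale maps to the new minimum)
print(normalizer.fit(5))  # 1.0 (maximum of the old scale maps to the new maximum)
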
-------------------------------------------------------------------------------- /clayrs/utils/context_managers.py: -------------------------------------------------------------------------------- 1 | import concurrent 2 | import concurrent.futures 3 | import os 4 | from concurrent.futures import as_completed 5 | from typing import Union, Iterator 6 | 7 | import distex 8 | import contextlib 9 | 10 | from tqdm import tqdm 11 | from tqdm.contrib.logging import logging_redirect_tqdm 12 | 13 | from clayrs.utils.const import logger 14 | 15 | 16 | @contextlib.contextmanager 17 | def get_progbar(iterator, total=None) -> tqdm: 18 | bar_format = "{desc} {percentage:.0f}%|{bar}| {n:}/{total_fmt} [{elapsed}<{remaining}]" 19 | with logging_redirect_tqdm(loggers=[logger]): 20 | with tqdm(iterator, bar_format=bar_format, total=total) as pbar: 21 | yield pbar 22 | 23 | 24 | def handle_exception(loop, context): 25 | # this is a simple hack to stop asyncio from logging the "task was never retrieved" exception 26 | # that should not happen in the first place. 27 | # In fact this problem happens only in specific scenarios like the PyCharm interpreter, or by running 28 | # an asyncio snippet as a script, but does not happen if the exact same script is run interactively, 29 | # or in an IPython environment 30 | pass 31 | 32 | 33 | @contextlib.contextmanager 34 | def get_iterator_parallel(num_cpus, f_to_parallelize, *args_to_f, 35 | progress_bar=False, total=None) -> Union[Iterator, tqdm]: 36 | 37 | num_cpus = num_cpus or os.cpu_count() or 1 38 | 39 | if num_cpus > 1: 40 | pool = distex.Pool(num_workers=num_cpus, func_pickle=distex.PickleType.cloudpickle) 41 | pool._loop.set_exception_handler(handle_exception) 42 | iterator_res = pool.map(f_to_parallelize, *args_to_f) 43 | else: 44 | pool = None 45 | iterator_res = map(f_to_parallelize, *args_to_f) 46 | 47 | try: 48 | if progress_bar: 49 | with get_progbar(iterator_res, total=total) as pbar: 50 | yield pbar 51 | else: 52 | yield iterator_res 53 | finally: 54 | if pool is not None: 55 | pool.shutdown() 56 | 57 | 58 | @contextlib.contextmanager 59 | def get_iterator_thread(max_workers, f_to_thread, *args_to_f, 60 | keep_order=False, progress_bar=False, total=None) -> Union[Iterator, tqdm]: 61 | 62 | # min(32, (os.cpu_count() or 1) + 4) taken from ThreadPoolExecutor 63 | max_workers = max_workers or min(32, (os.cpu_count() or 1) + 4) or 1 64 | 65 | if max_workers > 1: 66 | 67 | ex = concurrent.futures.ThreadPoolExecutor(max_workers) 68 | if keep_order: 69 | iterator_res = ex.map(f_to_thread, *args_to_f) 70 | else: 71 | iterator_res = as_completed([ex.submit(f_to_thread, *args) for args in zip(*args_to_f)]) 72 | else: 73 | ex = None 74 | iterator_res = map(f_to_thread, *args_to_f) 75 | 76 | try: 77 | if progress_bar: 78 | with get_progbar(iterator_res, total=total) as pbar: 79 | yield pbar 80 | else: 81 | yield iterator_res 82 | finally: 83 | if ex is not None: 84 | ex.shutdown() 85 | -------------------------------------------------------------------------------- /test/content_analyzer/test_config.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from clayrs.content_analyzer.exogenous_properties_retrieval import PropertiesFromDataset 4 | 5 | from clayrs.content_analyzer import FieldConfig, ExogenousConfig 6 | 7 | 8 | class TestFieldConfig(TestCase): 9 | def test_invalid_id(self): 10 | with self.assertRaises(ValueError): 11 | FieldConfig(id='.in.vali.d') 12 | 13 | with self.assertRaises(ValueError): 14 | FieldConfig(id='#in#vali#d') 15 | 16 | with self.assertRaises(ValueError): 17 | FieldConfig(id=' ') 18 | 19 | with self.assertRaises(ValueError): 20 | FieldConfig(id='is invalid') 21 | 22 | with self.assertRaises(ValueError): 23 | FieldConfig(id='is/inva/lid') 24 | 25 | # ...and many more 26 | 27 | def test_valid_id(self): 28 | valid_object = FieldConfig(id='test') 29 | self.assertIsNotNone(valid_object) 30 | 31 | valid_object = FieldConfig(id='test_valid') 32 | self.assertIsNotNone(valid_object) 33 | 34 | valid_object = FieldConfig(id='test-valid') 35 | self.assertIsNotNone(valid_object) 36 | 37 | valid_object = FieldConfig(id='test1-valid2') 38 | self.assertIsNotNone(valid_object) 39 | 40 | valid_object = FieldConfig(id='1_2-3_') 41 | self.assertIsNotNone(valid_object) 42 | 43 | # ...and many more 44 | 45 | 46 | class TestExogenousConfig(TestCase): 47 | def test_invalid_id(self): 48 | with self.assertRaises(ValueError): 49 |
ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='.in.vali.d') 50 | 51 | with self.assertRaises(ValueError): 52 | ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='#in#vali#d') 53 | 54 | with self.assertRaises(ValueError): 55 | ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id=' ') 56 | 57 | with self.assertRaises(ValueError): 58 | ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='is invalid') 59 | 60 | with self.assertRaises(ValueError): 61 | ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='is/inva/lid') 62 | 63 | # ...and many more 64 | 65 | def test_valid_id(self): 66 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test') 67 | self.assertIsNotNone(valid_object) 68 | 69 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test_valid') 70 | self.assertIsNotNone(valid_object) 71 | 72 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test-valid') 73 | self.assertIsNotNone(valid_object) 74 | 75 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test1-valid2') 76 | self.assertIsNotNone(valid_object) 77 | 78 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='1_2-3_') 79 | self.assertIsNotNone(valid_object) 80 | 81 | # ...and many more 82 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/information_processor/visual_preprocessors/torch_builtin_augmenter.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from torchvision import transforms 4 | from torchvision.transforms import InterpolationMode, AutoAugmentPolicy 5 | 6 | from clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_transformer import \ 7 | TorchBuiltInTransformer 8 | 9 | __all__ = [ 10 | "AutoAugmentPolicy", 11 | "TorchAutoAugment", 12 | "TorchRandAugment", 13 | "TorchTrivialAugmentWide" 14 | ] 15 | 16 | 17 | # AUGMENTERS 18 | 19 | class TorchAutoAugment(TorchBuiltInTransformer): 20 | """ 21 | Class that implements the AutoAugment Transformer from torchvision. 22 | The parameters one could pass are the same ones you would pass instantiating 23 | the transformer AutoAugment directly from torchvision. 24 | 25 | TorchVision documentation: [here](https://pytorch.org/vision/main/generated/torchvision.transforms.AutoAugment.html) 26 | 27 | NOTE: the augmented result will SUBSTITUTE the original input 28 | """ 29 | def __init__(self, policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET, 30 | interpolation: InterpolationMode = InterpolationMode.NEAREST, 31 | fill: Optional[List[float]] = None): 32 | 33 | super().__init__(transforms.AutoAugment(policy, interpolation, fill)) 34 | 35 | 36 | class TorchRandAugment(TorchBuiltInTransformer): 37 | """ 38 | Class that implements the RandAugment Transformer from torchvision. 39 | The parameters one could pass are the same ones you would pass instantiating 40 | the transformer RandAugment directly from torchvision.
41 | 42 | TorchVision documentation: [here](https://pytorch.org/vision/main/generated/torchvision.transforms.RandAugment.html) 43 | 44 | NOTE: the augmented result will SUBSTITUTE the original input 45 | """ 46 | def __init__( 47 | self, 48 | num_ops: int = 2, 49 | magnitude: int = 9, 50 | num_magnitude_bins: int = 31, 51 | interpolation: InterpolationMode = InterpolationMode.NEAREST, 52 | fill: Optional[List[float]] = None, 53 | ) -> None: 54 | super().__init__(transforms.RandAugment(num_ops, magnitude, num_magnitude_bins, interpolation, fill)) 55 | 56 | 57 | class TorchTrivialAugmentWide(TorchBuiltInTransformer): 58 | """ 59 | Class that implements the TrivialAugmentWide Transformer from torchvision. 60 | The parameters one could pass are the same ones you would pass instantiating 61 | the transformer TrivialAugmentWide directly from torchvision. 62 | 63 | TorchVision documentation: [here](https://pytorch.org/vision/main/generated/torchvision.transforms.TrivialAugmentWide.html) 64 | 65 | NOTE: the augmented result will SUBSTITUTE the original input 66 | """ 67 | def __init__( 68 | self, 69 | num_magnitude_bins: int = 31, 70 | interpolation: InterpolationMode = InterpolationMode.NEAREST, 71 | fill: Optional[List[float]] = None, 72 | ) -> None: 73 | super().__init__(transforms.TrivialAugmentWide(num_magnitude_bins, interpolation, fill)) 74 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/information_processor/information_processor_abstract.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, Any 3 | 4 | import torch 5 | 6 | 7 | class InformationProcessor(ABC): 8 | """ 9 | Abstract class that generalizes data processing. 10 | """ 11 | 12 | @abstractmethod 13 | def process(self, field_data: Any): 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def __eq__(self, other): 18 | raise NotImplementedError 19 | 20 | @abstractmethod 21 | def __str__(self): 22 | raise NotImplementedError 23 | 24 | @abstractmethod 25 | def __repr__(self): 26 | raise NotImplementedError 27 | 28 | 29 | class ImageProcessor(InformationProcessor, torch.nn.Module): 30 | """ 31 | Abstract class for image processing. 32 | """ 33 | @abstractmethod 34 | def forward(self, field_data: torch.Tensor) -> torch.Tensor: 35 | raise NotImplementedError 36 | 37 | def process(self, field_data: torch.Tensor) -> torch.Tensor: 38 | return self.forward(field_data) 39 | 40 | def __eq__(self, other): 41 | return torch.nn.Module.__eq__(self, other) 42 | 43 | 44 | class AudioProcessor(InformationProcessor): 45 | """ 46 | Abstract class for audio processing. 47 | """ 48 | @abstractmethod 49 | def process(self, field_data): 50 | raise NotImplementedError 51 | 52 | 53 | class TextProcessor(InformationProcessor): 54 | """ 55 | Abstract class for raw text processing. 
56 | """ 57 | 58 | @staticmethod 59 | def list_to_string(text: List[str]) -> str: 60 | """ 61 | Convert a list of strings into a single string 62 | Args: text (List[str]): list of strings 63 | Returns: str sentence 64 | """ 65 | string_text = ' '.join([str(elem) for elem in text]) 66 | return string_text 67 | 68 | @staticmethod 69 | def string_to_list(text: str) -> List[str]: 70 | """ 71 | Convert a string into a list of strings 72 | Args: 73 | text (str): str sentence 74 | 75 | Returns List : List of words 76 | """ 77 | list_text = list(text.split(" ")) 78 | return list_text 79 | 80 | @abstractmethod 81 | def process(self, field_data: str): 82 | raise NotImplementedError 83 | 84 | 85 | class NLP(TextProcessor): 86 | """ 87 | Class for processing a text via Natural Language Processing. 88 | 89 | """ 90 | 91 | @abstractmethod 92 | def process(self, field_data: str) -> List[str]: 93 | """ 94 | Apply on the original text the required preprocessing steps 95 | Args: 96 | field_data: text on which NLP with specified phases will be applied 97 | 98 | Returns: 99 | list: The text, after being processed with the specified NLP pipeline, 100 | is split into single words that are put into a list. The splitting is performed 101 | even if none of the preprocessing steps is applied. 102 | """ 103 | raise NotImplementedError 104 |
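
A minimal concrete subclass (a hypothetical example, not a framework class) showing the contract above: process() must return the text split into tokens, and the abstract dunder methods inherited from InformationProcessor must be implemented for the class to be instantiable.

# Hypothetical toy preprocessor, shown only to illustrate the NLP contract
from typing import List

from clayrs.content_analyzer.information_processor.information_processor_abstract import NLP


class LowercaseNLP(NLP):
    """Lowercases the text, then splits it into single words."""

    def process(self, field_data: str) -> List[str]:
        return self.string_to_list(field_data.lower())

    def __eq__(self, other):
        return isinstance(other, LowercaseNLP)

    def __str__(self):
        return "LowercaseNLP"

    def __repr__(self):
        return "LowercaseNLP()"


print(LowercaseNLP().process("The Dark Knight"))  # ['the', 'dark', 'knight']
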
-------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/random_indexing.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import gensim 5 | from gensim.models import RpModel 6 | from gensim.corpora import Dictionary 7 | 8 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimDocumentEmbeddingLearner 9 | from clayrs.content_analyzer.utils.check_tokenization import check_tokenized 10 | 11 | 12 | class GensimRandomIndexing(GensimDocumentEmbeddingLearner): 13 | """ 14 | Class that implements the Random Indexing model thanks to the Gensim library. 15 | 16 | If a pre-trained local Random Indexing model must be loaded, put its path in the `reference` parameter. 17 | Otherwise, a Random Indexing model will be trained from scratch based on the preprocessed corpus of the contents to complexly 18 | represent 19 | 20 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 21 | `auto_save=True`. If `reference` is None, the trained model won't be saved after training and will only be used to 22 | produce contents in the current run 23 | 24 | Additional parameters regarding the model itself can be passed; check the [gensim documentation](https://radimrehurek.com/gensim/models/rpmodel.html) 25 | to see what else can be customized 26 | 27 | Args: 28 | reference: Path of the model to load/where the trained model will be saved if `auto_save=True`. If None, the 29 | trained model won't be saved after training and will only be used to produce contents in the current run 30 | auto_save: If True, the model will be saved in the path specified in the `reference` parameter 31 | """ 32 | 33 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs): 34 | super().__init__(reference, auto_save, ".model", **kwargs) 35 | 36 | def fit_model(self, corpus: List): 37 | dictionary = Dictionary(corpus) 38 | word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus] 39 | self.model = RpModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters) 40 | 41 | def load_model(self): 42 | return RpModel.load(self.reference) 43 | 44 | def get_vector_size(self) -> int: 45 | return self.model.num_topics 46 | 47 | def get_embedding(self, document_tokenized: List[str]) -> np.ndarray: 48 | unseen_doc = self.model.id2word.doc2bow(check_tokenized(document_tokenized)) 49 | 50 | # if document is totally new (no word in train corpus) KeyError is raised 51 | # and load method of embedding source will fill the document vector with zeros 52 | if len(unseen_doc) == 0: 53 | raise KeyError 54 | 55 | sparse_vector = self.model[unseen_doc] 56 | dense_vector = gensim.matutils.sparse2full(sparse_vector, self.model.num_topics) 57 | return dense_vector 58 | 59 | def __str__(self): 60 | return "GensimRandomProjections" 61 | 62 | def __repr__(self): 63 | return f"GensimRandomProjections(reference={self.reference}, auto_save={self._auto_save}, " \ 64 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})" 65 | -------------------------------------------------------------------------------- /clayrs/content_analyzer/embeddings/embedding_learner/latent_semantic_analysis.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import gensim 4 | from gensim.corpora import Dictionary 5 | from gensim.models import LsiModel 6 | 7 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimDocumentEmbeddingLearner 8 | from clayrs.content_analyzer.utils.check_tokenization import check_tokenized 9 | 10 | 11 | class GensimLatentSemanticAnalysis(GensimDocumentEmbeddingLearner): 12 | """ 13 | Class that implements Latent Semantic Analysis (A.K.A. Latent Semantic Indexing) 14 | (LSI) thanks to the Gensim library. 15 | 16 | If a pre-trained local LSA model must be loaded, put its path in the `reference` parameter. 17 | Otherwise, an LSA model will be trained from scratch based on the preprocessed corpus of the contents to complexly 18 | represent 19 | 20 | If you'd like to save the model once trained, set the path in the `reference` parameter and set 21 | `auto_save=True`. If `reference` is None, the trained model won't be saved after training and will only be used to 22 | produce contents in the current run 23 | 24 | Additional parameters regarding the model itself can be passed; check the [gensim documentation](https://radimrehurek.com/gensim/models/lsimodel.html) 25 | to see what else can be customized 26 | 27 | Args: 28 | reference: Path of the model to load/where the trained model will be saved if `auto_save=True`.
If None the 29 | trained model won't be saved after training and will only be used to produce contents in the current run 30 | auto_save: If True, the model will be saved in the path specified in `reference` parameter 31 | """ 32 | 33 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs): 34 | super().__init__(reference, auto_save, ".model", **kwargs) 35 | 36 | def fit_model(self, corpus: List): 37 | dictionary = Dictionary(corpus) 38 | word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus] 39 | self.model = LsiModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters) 40 | 41 | def load_model(self): 42 | return LsiModel.load(self.reference) 43 | 44 | def get_vector_size(self) -> int: 45 | return self.model.num_topics 46 | 47 | def get_embedding(self, document_tokenized: List[str]): 48 | unseen_doc = self.model.id2word.doc2bow(check_tokenized(document_tokenized)) 49 | 50 | # if document is totally new (no word in train corpus) KeyError is raised 51 | # and load method of embedding source will fill the document vector with zeros 52 | if len(unseen_doc) == 0: 53 | raise KeyError 54 | 55 | sparse_vector = self.model[unseen_doc] 56 | dense_vector = gensim.matutils.sparse2full(sparse_vector, self.model.num_topics) 57 | return dense_vector 58 | 59 | def __str__(self): 60 | return "GensimLatentSemanticAnalysis" 61 | 62 | def __repr__(self): 63 | return f"GensimLatentSemanticAnalysis(reference={self.reference}, auto_save={self._auto_save}, " \ 64 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})" 65 | -------------------------------------------------------------------------------- /test/recsys/graph_based_algorithm/test_graph_based_algorithm.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import pandas as pd 3 | 4 | from clayrs.content_analyzer import Ratings 5 | from clayrs.recsys.graphs.graph import ItemNode, UserNode, PropertyNode 6 | from clayrs.recsys.graphs.nx_implementation.nx_full_graphs import NXFullGraph 7 | 8 | from clayrs.recsys.graph_based_algorithm.page_rank.nx_page_rank import NXPageRank 9 | 10 | 11 | class TestGraphBasedAlgorithm(TestCase): 12 | 13 | def setUp(self) -> None: 14 | ratings = pd.DataFrame.from_records([ 15 | ("A000", "tt0114576", 1, "54654675"), 16 | ("A000", "tt0112453", -0.2, "54654675"), 17 | ("A001", "tt0114576", 0.8, "54654675"), 18 | ("A001", "tt0112896", -0.4, "54654675"), 19 | ("A000", "tt0113041", 0.6, "54654675"), 20 | ("A002", "tt0112453", -0.2, "54654675"), 21 | ("A002", "tt0113497", 0.5, "54654675"), 22 | ("A003", "tt0112453", -0.8, "54654675")], 23 | columns=["from_id", "to_id", "score", "timestamp"]) 24 | ratings = Ratings.from_dataframe(ratings) 25 | 26 | self.graph = NXFullGraph(ratings) 27 | 28 | # GraphBasedAlgorithm is an abstract class, so we instantiate a subclass in order to test its methods 29 | self.alg = NXPageRank() 30 | 31 | def test_filter_result(self): 32 | rank = {UserNode("A000"): 0.5, ItemNode("tt0114576"): 0.5, UserNode("A001"): 0.5, ItemNode("tt0113497"): 0.5, 33 | ItemNode("tt0112453"): 0.5, PropertyNode("Nolan"): 0.5} 34 | 35 | # filter list with item i1, in this case graph parameter and user node parameter won't do anything 36 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=[ItemNode('tt0114576')], 37 | user_node=UserNode("A000")) 38 | expected = {ItemNode("tt0114576"): 0.5} 39 | self.assertEqual(expected, result) 40 | 41 | # filter list with item 
tt0114576 and property Nolan, in this case graph parameter and user node parameter won't do anything 42 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=[ItemNode('tt0114576'), 43 | PropertyNode('Nolan')], 44 | user_node=UserNode("A000")) 45 | expected = {ItemNode('tt0114576'): 0.5, PropertyNode("Nolan"): 0.5} 46 | self.assertEqual(expected, result) 47 | 48 | # filter with non-existent nodes, result will be empty 49 | # in this case graph parameter and user node parameter won't do anything 50 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=[ItemNode('non_existent')], 51 | user_node=UserNode("A000")) 52 | expected = {} 53 | self.assertEqual(expected, result) 54 | 55 | # clean result for user A000, the cleaned result will have only item nodes 56 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=None, 57 | user_node=UserNode("A000")) 58 | expected = {ItemNode("tt0113497"): 0.5} 59 | 60 | self.assertEqual(expected, result) 61 | -------------------------------------------------------------------------------- /test/recsys/graphs/test_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | 5 | from clayrs.content_analyzer import Ratings 6 | from clayrs.recsys import NXFullGraph 7 | from test import dir_test_files 8 | from unittest import TestCase 9 | 10 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/') 11 | users_dir = os.path.join(dir_test_files, 'complex_contents', 'users_codified/') 12 | 13 | rat = pd.DataFrame.from_dict({'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"], 14 | 'to_id': ["tt0112281", "tt0112302", "tt0112281", "tt0112346", 15 | "tt0112453", "tt0112453", "tt0112346", "tt0112453"], 16 | 'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]}) 17 | rat = Ratings.from_dataframe(rat) 18 | 19 | 20 | class TestGraph(TestCase): 21 | def setUp(self) -> None: 22 | # we need to instantiate a subclass of Graph to test its methods, since Graph is an abstract class 23 | self.g: NXFullGraph = NXFullGraph(rat, 24 | item_contents_dir=movies_dir, 25 | item_exo_properties={'dbpedia': ['film director', 26 | 'runtime (m)']}, 27 | 28 | # It's the column in the users .DAT which identifies the gender 29 | user_exo_properties={'local': '1'}, 30 | user_contents_dir=users_dir) 31 | 32 | def test_to_ratings(self): 33 | converted_rat = self.g.to_ratings() 34 | 35 | # check that original ratings and converted ratings are equal 36 | self.assertEqual(set(rat.unique_user_id_column), set(converted_rat.unique_user_id_column)) 37 | self.assertEqual(set(rat.unique_item_id_column), set(converted_rat.unique_item_id_column)) 38 | self.assertEqual(set(rat.score_column), set(converted_rat.score_column)) 39 | self.assertEqual(set(rat.timestamp_column), set(converted_rat.timestamp_column)) 40 | 41 | # check that, for each user, we have the same user interactions 42 | for user in rat.user_id_column: 43 | user_rat = rat.get_user_interactions(user) 44 | user_converted_rat = converted_rat.get_user_interactions(user) 45 | 46 | self.assertCountEqual(user_rat, user_converted_rat) 47 | 48 | # user map set, so we expect the same user map between expected and result 49 | converted_rat_with_user_map = self.g.to_ratings(user_map=rat.user_map) 50 | self.assertEqual(list(rat.user_map), list(converted_rat_with_user_map.user_map)) 51 | 52 | # item map set, so we expect the same item map between expected and result 53 | converted_rat_with_item_map =
--------------------------------------------------------------------------------
/test/recsys/graphs/test_graph.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import pandas as pd
 4 | 
 5 | from clayrs.content_analyzer import Ratings
 6 | from clayrs.recsys import NXFullGraph
 7 | from test import dir_test_files
 8 | from unittest import TestCase
 9 | 
10 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/')
11 | users_dir = os.path.join(dir_test_files, 'complex_contents', 'users_codified/')
12 | 
13 | rat = pd.DataFrame.from_dict({'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
14 |                               'to_id': ["tt0112281", "tt0112302", "tt0112281", "tt0112346",
15 |                                         "tt0112453", "tt0112453", "tt0112346", "tt0112453"],
16 |                               'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]})
17 | rat = Ratings.from_dataframe(rat)
18 | 
19 | 
20 | class TestGraph(TestCase):
21 |     def setUp(self) -> None:
22 |         # Graph is an abstract class, so we instantiate a subclass in order to test its methods
23 |         self.g: NXFullGraph = NXFullGraph(rat,
24 |                                           item_contents_dir=movies_dir,
25 |                                           item_exo_properties={'dbpedia': ['film director',
26 |                                                                            'runtime (m)']},
27 | 
28 |                                           # it's the column in the users .DAT file which identifies the gender
29 |                                           user_exo_properties={'local': '1'},
30 |                                           user_contents_dir=users_dir)
31 | 
32 |     def test_to_ratings(self):
33 |         converted_rat = self.g.to_ratings()
34 | 
35 |         # check that original ratings and converted ratings are equal
36 |         self.assertEqual(set(rat.unique_user_id_column), set(converted_rat.unique_user_id_column))
37 |         self.assertEqual(set(rat.unique_item_id_column), set(converted_rat.unique_item_id_column))
38 |         self.assertEqual(set(rat.score_column), set(converted_rat.score_column))
39 |         self.assertEqual(set(rat.timestamp_column), set(converted_rat.timestamp_column))
40 | 
41 |         # check that, for each user, we have the same user interactions
42 |         for user in rat.user_id_column:
43 |             user_rat = rat.get_user_interactions(user)
44 |             user_converted_rat = converted_rat.get_user_interactions(user)
45 | 
46 |             self.assertCountEqual(user_rat, user_converted_rat)
47 | 
48 |         # user map set, so we expect the same user map between expected and result
49 |         converted_rat_with_user_map = self.g.to_ratings(user_map=rat.user_map)
50 |         self.assertEqual(list(rat.user_map), list(converted_rat_with_user_map.user_map))
51 | 
52 |         # item map set, so we expect the same item map between expected and result
53 |         converted_rat_with_item_map = self.g.to_ratings(item_map=rat.item_map)
54 |         self.assertEqual(list(rat.item_map), list(converted_rat_with_item_map.item_map))
55 | 
56 |         # user map and item map set, so we expect both to be equal between expected and result
57 |         converted_rat_with_user_item_map = self.g.to_ratings(user_map=rat.user_map, item_map=rat.item_map)
58 |         self.assertEqual(list(rat.user_map), list(converted_rat_with_user_item_map.user_map))
59 |         self.assertEqual(list(rat.item_map), list(converted_rat_with_user_item_map.item_map))
60 | 
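test_to_ratings above exercises a round-trip property: a graph built from a Ratings object can be converted back without losing interactions, and passing the original user_map/item_map makes the integer mappings line up as well. A condensed sketch of the same property (illustrative only; user and item ids are hypothetical):

    # Condensed round-trip sketch -- mirrors the test above, not new library API
    import pandas as pd
    from clayrs.content_analyzer import Ratings
    from clayrs.recsys import NXFullGraph

    ratings = Ratings.from_dataframe(pd.DataFrame(
        {'from_id': ["u1", "u2"], 'to_id': ["i1", "i2"], 'score': [0.5, 1.0]}))

    graph = NXFullGraph(ratings)                              # Ratings -> graph
    round_trip = graph.to_ratings(user_map=ratings.user_map,  # graph -> Ratings, reusing
                                  item_map=ratings.item_map)  # the original id mappings

    assert list(ratings.user_map) == list(round_trip.user_map)
    assert list(ratings.item_map) == list(round_trip.item_map)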
--------------------------------------------------------------------------------
/test/content_analyzer/content_representation/test_representation_container.py:
--------------------------------------------------------------------------------
 1 | from unittest import TestCase
 2 | import numpy as np
 3 | 
 4 | from clayrs.content_analyzer.content_representation.representation_container import RepresentationContainer
 5 | 
 6 | 
 7 | class TestRepresentationContainer(TestCase):
 8 | 
 9 |     def test_rep_container(self):
10 |         rep_container = RepresentationContainer(['rep1', 'rep2', 'rep3'], ['test1', None, 'test3'])
11 | 
12 |         # tests to check that the indexes and columns of the dataframe in the representation container are set as expected
13 |         self.assertEqual([0, 1, 2], rep_container.get_internal_index())
14 |         self.assertEqual(['test1', None, 'test3'], rep_container.get_external_index())
15 |         self.assertEqual(['rep1', 'rep2', 'rep3'], rep_container.get_representations())
16 | 
17 |         # tests to check that the representation related to the internal_id or external_id passed to rep_container
18 |         # is the appropriate representation
19 |         for _ in range(15000):
20 |             self.assertEqual('rep1', rep_container[0])
21 |             self.assertEqual('rep3', rep_container['test3'])
22 | 
23 |         # tests to check the correct functionality of the append and pop methods
24 |         rep_container.append('rep4', 'test4')
25 |         self.assertEqual('rep4', rep_container['test4'])
26 | 
27 |         value_removed = rep_container.pop('test4')
28 |         self.assertEqual('rep4', value_removed)
29 |         self.assertFalse('rep4' in rep_container.get_representations())
30 | 
31 |         # test for empty representation container
32 |         empty_rep_container = RepresentationContainer()
33 |         self.assertEqual(0, len(empty_rep_container))
34 | 
35 |         # test for passing a single value to the representation container constructor instead of lists
36 |         single_rep_container = RepresentationContainer('rep', 'test')
37 |         self.assertEqual('rep', single_rep_container['test'])
38 | 
39 |         # test that an exception is raised when external_id and representation lists of different lengths are passed to the constructor
40 |         with self.assertRaises(ValueError):
41 |             RepresentationContainer(['rep1', 'rep2'], ['test1'])
42 | 
43 |         # test that an exception is raised when external_id and representation lists of different lengths are passed to the 'append' method
44 |         with self.assertRaises(ValueError):
45 |             rep_container.append(['rep1', 'rep2'], ['test1'])
46 | 
47 |         # test that an exception is raised when the requested representation is not present
48 |         with self.assertRaises(KeyError):
49 |             err = rep_container['not_existent']
50 | 
51 |     def test_iter(self):
52 |         rep_container = RepresentationContainer(['rep1', 'rep2', 'rep3'], ['test1', None, 'test3'])
53 | 
54 |         expected_list = [
55 |             {'internal_id': 0, 'external_id': 'test1', 'representation': 'rep1'},
56 |             {'internal_id': 1, 'external_id': None, 'representation': 'rep2'},
57 |             {'internal_id': 2, 'external_id': 'test3', 'representation': 'rep3'}
58 |         ]
59 | 
60 |         it = iter(rep_container)
61 | 
62 |         self.assertEqual(expected_list[0], next(it))
63 |         self.assertEqual(expected_list[1], next(it))
64 |         self.assertEqual(expected_list[2], next(it))
65 | 
66 |         # check that the iterator raises StopIteration since there aren't any items left
67 |         with self.assertRaises(StopIteration):
68 |             next(it)
69 | 
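The tests above exercise the dual indexing of RepresentationContainer: each representation is reachable both through its positional internal_id and, when present, through its external_id. A minimal sketch of that idea (illustrative only; the actual class is backed by a pandas DataFrame, per the comments in the test, and also supports single-value construction):

    # Hypothetical dual-index sketch -- not the actual implementation
    class DualIndexContainer:
        def __init__(self, representations, external_ids):
            # lists only, for brevity; lengths must match, as the ValueError tests require
            if len(representations) != len(external_ids):
                raise ValueError("representations and external ids must have equal length")
            self._reps = list(representations)
            self._ext = list(external_ids)

        def __getitem__(self, key):
            # integers address the internal (positional) index,
            # any other key is resolved through the external index
            if isinstance(key, int):
                return self._reps[key]
            if key in self._ext:
                return self._reps[self._ext.index(key)]
            raise KeyError(key)

    container = DualIndexContainer(['rep1', 'rep2'], ['test1', None])
    assert container[0] == 'rep1' and container['test1'] == 'rep1'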
--------------------------------------------------------------------------------
/test/test_files/movies_info_reduced.csv:
--------------------------------------------------------------------------------
1 | "Title","Year","Rated","Released","Runtime","Genre","Director","Writer","Actors","Plot","Language","Country","Awards","Poster","Metascore","imdbRating","imdbVotes","imdbID","Type","DVD","BoxOffice","Production","Website","Response"
2 | "Jumanji","1995","PG","15 Dec 1995","104 min","Adventure, Family, Fantasy","Joe Johnston","Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","English, French","USA","4 wins & 9 nominations.","https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","39","6.9","260,909","tt0113497","movie","25 Jan 2000","N/A","Sony Pictures Home Entertainment","N/A","True"
3 | "Grumpier Old Men","1995","PG-13","22 Dec 1995","101 min","Comedy, Romance","Howard Deutch","Mark Steven Johnson (characters), Mark Steven Johnson","Walter Matthau, Jack Lemmon, Sophia Loren, Ann-Margret","Things don't seem to change much in Wabasha County: Max and John are still fighting after 35 years, Grandpa still drinks, smokes, and chases women , and nobody's been able to catch the fabled ""Catfish Hunter"", a gigantic catfish that actually smiles at fishermen who try to snare it. Six months ago John married the new girl in town (Ariel), and people begin to suspect that Max might be missing something similar in his life. The only joy Max claims is left in his life is fishing, but that might change with the new owner of the bait shop.","English, Italian, German","USA","2 wins & 2 nominations.","https://m.media-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","46","6.6","21,823","tt0113228","movie","18 Nov 1997","N/A","Warner Home Video","N/A","True"
4 | "Toy Story","1995","G","22 Nov 1995","81 min","Animation, Adventure, Comedy, Family, Fantasy","John Lasseter","John Lasseter (original story by), Pete Docter (original story by), Andrew Stanton (original story by), Joe Ranft (original story by), Joss Whedon (screenplay by), Andrew Stanton (screenplay by), Joel Cohen (screenplay by), Alec Sokolow (screenplay by)","Tom Hanks, Tim Allen, Don Rickles, Jim Varney","A little boy named Andy loves to be in his room, playing with his toys, especially his doll named ""Woody"". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy's family moving, and what Woody does not know is about Andy's birthday party. Woody does not realize that Andy's mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy's new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.","English","USA","Nominated for 3 Oscars. Another 23 wins & 17 nominations.","https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg","95","8.3","761,649","tt0114709","movie","20 Mar 2001","N/A","Buena Vista","http://www.disney.com/ToyStory","True"
5 | 
--------------------------------------------------------------------------------