├── .gitignore
├── test
│ ├── recsys
│ │ ├── __init__.py
│ │ ├── graphs
│ │ │ ├── __init__.py
│ │ │ ├── feature_selection
│ │ │ │ └── __init__.py
│ │ │ ├── test_networkx_implementation
│ │ │ │ └── __init__.py
│ │ │ └── test_graph.py
│ │ ├── content_based_algorithm
│ │ │ ├── __init__.py
│ │ │ ├── classifier
│ │ │ │ └── __init__.py
│ │ │ ├── regressor
│ │ │ │ └── __init__.py
│ │ │ ├── centroid_vector
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_similarities.py
│ │ │ ├── index_query
│ │ │ │ └── __init__.py
│ │ │ └── test_contents_loader.py
│ │ ├── graph_based_algorithm
│ │ │ ├── __init__.py
│ │ │ ├── page_rank
│ │ │ │ └── __init__.py
│ │ │ └── test_graph_based_algorithm.py
│ │ └── visual_based_algorithm
│ │ │ ├── __init__.py
│ │ │ └── vbpr
│ │ │ │ └── __init__.py
│ ├── evaluation
│ │ ├── __init__.py
│ │ ├── metrics
│ │ │ └── __init__.py
│ │ └── eval_pipeline_modules
│ │ │ └── __init__.py
│ ├── content_analyzer
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── test_check_tokenization.py
│ │ │ └── test_id_merger.py
│ │ ├── embeddings
│ │ │ ├── __init__.py
│ │ │ ├── embedding_learner
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_doc2vec.py
│ │ │ │ ├── test_word2vec.py
│ │ │ │ ├── test_fasttext.py
│ │ │ │ ├── test_random_indexing.py
│ │ │ │ ├── test_lda.py
│ │ │ │ └── test_latent_semantic_analysis.py
│ │ │ ├── embedding_loader
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_sbert.py
│ │ │ │ └── test_gensim_loader.py
│ │ │ └── test_embedding_source.py
│ │ ├── memory_interfaces
│ │ │ └── __init__.py
│ │ ├── ratings_manager
│ │ │ ├── __init__.py
│ │ │ ├── test_sentiment_analysis.py
│ │ │ └── test_rating_processor.py
│ │ ├── information_processor
│ │ │ ├── __init__.py
│ │ │ ├── test_visualpostprocessors
│ │ │ │ └── __init__.py
│ │ │ └── test_visualpreprocessors
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_torch_builtin_augmenter.py
│ │ ├── field_content_production_techniques
│ │ │ ├── __init__.py
│ │ │ ├── visual_technique
│ │ │ │ └── __init__.py
│ │ │ ├── embedding_technique
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_combining_technique.py
│ │ │ ├── test_int.json
│ │ │ ├── test_synset_document_frequency.py
│ │ │ └── test_tf_idf.py
│ │ ├── __init__.py
│ │ ├── content_representation
│ │ │ ├── __init__.py
│ │ │ └── test_representation_container.py
│ │ └── test_config.py
│ ├── test_files
│ │ ├── complex_contents
│ │ │ ├── index
│ │ │ │ ├── MAIN_WRITELOCK
│ │ │ │ ├── _MAIN_2.toc
│ │ │ │ └── MAIN_hlk83r2qer820iyx.seg
│ │ │ ├── users_codified
│ │ │ │ ├── 1.xz
│ │ │ │ ├── 10.xz
│ │ │ │ ├── 11.xz
│ │ │ │ ├── 12.xz
│ │ │ │ ├── 13.xz
│ │ │ │ ├── 14.xz
│ │ │ │ ├── 15.xz
│ │ │ │ ├── 16.xz
│ │ │ │ ├── 17.xz
│ │ │ │ ├── 18.xz
│ │ │ │ ├── 19.xz
│ │ │ │ ├── 2.xz
│ │ │ │ ├── 20.xz
│ │ │ │ ├── 21.xz
│ │ │ │ ├── 22.xz
│ │ │ │ ├── 23.xz
│ │ │ │ ├── 24.xz
│ │ │ │ ├── 25.xz
│ │ │ │ ├── 26.xz
│ │ │ │ ├── 27.xz
│ │ │ │ ├── 28.xz
│ │ │ │ ├── 29.xz
│ │ │ │ ├── 3.xz
│ │ │ │ ├── 30.xz
│ │ │ │ ├── 31.xz
│ │ │ │ ├── 32.xz
│ │ │ │ ├── 33.xz
│ │ │ │ ├── 34.xz
│ │ │ │ ├── 35.xz
│ │ │ │ ├── 36.xz
│ │ │ │ ├── 37.xz
│ │ │ │ ├── 38.xz
│ │ │ │ ├── 39.xz
│ │ │ │ ├── 4.xz
│ │ │ │ ├── 40.xz
│ │ │ │ ├── 41.xz
│ │ │ │ ├── 42.xz
│ │ │ │ ├── 43.xz
│ │ │ │ ├── 44.xz
│ │ │ │ ├── 45.xz
│ │ │ │ ├── 46.xz
│ │ │ │ ├── 47.xz
│ │ │ │ ├── 48.xz
│ │ │ │ ├── 49.xz
│ │ │ │ ├── 5.xz
│ │ │ │ ├── 50.xz
│ │ │ │ ├── 51.xz
│ │ │ │ ├── 52.xz
│ │ │ │ ├── 53.xz
│ │ │ │ ├── 54.xz
│ │ │ │ ├── 55.xz
│ │ │ │ ├── 56.xz
│ │ │ │ ├── 57.xz
│ │ │ │ ├── 58.xz
│ │ │ │ ├── 59.xz
│ │ │ │ ├── 6.xz
│ │ │ │ ├── 60.xz
│ │ │ │ ├── 61.xz
│ │ │ │ ├── 62.xz
│ │ │ │ ├── 63.xz
│ │ │ │ ├── 64.xz
│ │ │ │ ├── 65.xz
│ │ │ │ ├── 66.xz
│ │ │ │ ├── 67.xz
│ │ │ │ ├── 68.xz
│ │ │ │ ├── 69.xz
│ │ │ │ ├── 7.xz
│ │ │ │ ├── 70.xz
│ │ │ │ ├── 8.xz
│ │ │ │ └── 9.xz
│ │ │ ├── movies_codified
│ │ │ │ ├── tt0112281.xz
│ │ │ │ ├── tt0112302.xz
│ │ │ │ ├── tt0112346.xz
│ │ │ │ ├── tt0112453.xz
│ │ │ │ ├── tt0112641.xz
│ │ │ │ ├── tt0112760.xz
│ │ │ │ ├── tt0112896.xz
│ │ │ │ ├── tt0113041.xz
│ │ │ │ ├── tt0113101.xz
│ │ │ │ ├── tt0113189.xz
│ │ │ │ ├── tt0113228.xz
│ │ │ │ ├── tt0113277.xz
│ │ │ │ ├── tt0113497.xz
│ │ │ │ ├── tt0113845.xz
│ │ │ │ ├── tt0113987.xz
│ │ │ │ ├── tt0114319.xz
│ │ │ │ ├── tt0114388.xz
│ │ │ │ ├── tt0114576.xz
│ │ │ │ ├── tt0114709.xz
│ │ │ │ └── tt0114885.xz
│ │ │ └── create_complex_contents.py
│ │ ├── test_embedding_models
│ │ │ ├── ri_model.model
│ │ │ ├── doc2vec_model.kv
│ │ │ ├── fasttext_model.kv
│ │ │ ├── word2vec_model.kv
│ │ │ └── lsa
│ │ │ │ ├── lsa_model.model
│ │ │ │ └── lsa_model.model.projection
│ │ ├── random_tsv.tsv
│ │ ├── test_images
│ │ │ ├── images_files
│ │ │ │ ├── o-neill-dress-black-and-white-164-1.jpg
│ │ │ │ ├── wildfox-floral-print-leggings-357-1.jpg
│ │ │ │ ├── anthropologie-skirt-light-pink-434-1.jpg
│ │ │ │ ├── haute-hippie-top-white-and-black-1015-1.jpg
│ │ │ │ └── elizabeth-and-james-top-ecru-and-pink-81-1.jpg
│ │ │ ├── tradesy_small_local_relative_paths.json
│ │ │ └── tradesy_small_online.json
│ │ ├── d2v_test_data.json
│ │ ├── users_info.json
│ │ ├── test_dbpedia
│ │ │ └── movies_info_reduced.json
│ │ ├── test_ratings
│ │ │ └── ratings_1591277020.csv
│ │ ├── test_import_ratings.json
│ │ ├── test_decode
│ │ │ ├── movies_title_string.json
│ │ │ ├── movies_title_tfidf.json
│ │ │ └── movies_title_embedding.json
│ │ ├── users_70.dat
│ │ └── movies_info_reduced.csv
│ ├── utils
│ │ ├── __init__.py
│ │ ├── test_load_content.py
│ │ ├── test_class_utils.py
│ │ ├── test_context_managers.py
│ │ └── test_automatic_methods.py
│ └── __init__.py
├── MANIFEST.in
├── clayrs
│ ├── content_analyzer
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── check_tokenization.py
│ │ │ └── id_merger.py
│ │ ├── content_representation
│ │ │ └── __init__.py
│ │ ├── information_processor
│ │ │ ├── postprocessors
│ │ │ │ └── __init__.py
│ │ │ ├── visual_preprocessors
│ │ │ │ ├── __init__.py
│ │ │ │ └── torch_builtin_augmenter.py
│ │ │ ├── __init__.py
│ │ │ └── information_processor_abstract.py
│ │ ├── memory_interfaces
│ │ │ └── __init__.py
│ │ ├── embeddings
│ │ │ ├── __init__.py
│ │ │ ├── embedding_loader
│ │ │ │ ├── __init__.py
│ │ │ │ ├── gensim.py
│ │ │ │ ├── sbert.py
│ │ │ │ └── vector_strategy.py
│ │ │ └── embedding_learner
│ │ │ │ ├── __init__.py
│ │ │ │ ├── word2vec.py
│ │ │ │ ├── fasttext.py
│ │ │ │ ├── doc2vec.py
│ │ │ │ ├── lda.py
│ │ │ │ ├── random_indexing.py
│ │ │ │ └── latent_semantic_analysis.py
│ │ ├── ratings_manager
│ │ │ ├── __init__.py
│ │ │ ├── sentiment_analysis.py
│ │ │ └── score_processor.py
│ │ ├── field_content_production_techniques
│ │ │ ├── visual_techniques
│ │ │ │ └── __init__.py
│ │ │ ├── embedding_technique
│ │ │ │ └── __init__.py
│ │ │ └── __init__.py
│ │ ├── __init__.py
│ │ └── exceptions.py
│ ├── evaluation
│ │ ├── eval_pipeline_modules
│ │ │ └── __init__.py
│ │ ├── exceptions.py
│ │ ├── __init__.py
│ │ └── metrics
│ │ │ ├── __init__.py
│ │ │ └── metrics.py
│ ├── recsys
│ │ ├── visual_based_algorithm
│ │ │ ├── __init__.py
│ │ │ └── vbpr
│ │ │ │ └── __init__.py
│ │ ├── graph_based_algorithm
│ │ │ ├── page_rank
│ │ │ │ └── __init__.py
│ │ │ └── __init__.py
│ │ ├── content_based_algorithm
│ │ │ ├── index_query
│ │ │ │ └── __init__.py
│ │ │ ├── centroid_vector
│ │ │ │ ├── __init__.py
│ │ │ │ └── similarities.py
│ │ │ ├── classifier
│ │ │ │ └── __init__.py
│ │ │ ├── regressor
│ │ │ │ └── __init__.py
│ │ │ ├── __init__.py
│ │ │ └── exceptions.py
│ │ ├── graphs
│ │ │ ├── __init__.py
│ │ │ ├── feature_selection
│ │ │ │ ├── exceptions.py
│ │ │ │ └── __init__.py
│ │ │ ├── nx_implementation
│ │ │ │ └── __init__.py
│ │ │ └── graph_metrics.py
│ │ ├── __init__.py
│ │ └── algorithm.py
│ ├── __init__.py
│ └── utils
│ │ ├── __init__.py
│ │ ├── const.py
│ │ ├── automatic_methods.py
│ │ ├── custom_logger.py
│ │ ├── class_utils.py
│ │ ├── load_content.py
│ │ └── context_managers.py
├── setup.cfg
├── pyproject.toml
├── docs
│ └── mkdocs
│ │ ├── requirements-doc.txt
│ │ └── docs
│ │ │ ├── evaluation
│ │ │ │ ├── eval_model.md
│ │ │ │ ├── metrics
│ │ │ │ │ ├── plot_metrics.md
│ │ │ │ │ ├── error_metrics.md
│ │ │ │ │ ├── ranking_metrics.md
│ │ │ │ │ ├── fairness_metrics.md
│ │ │ │ │ └── classification_metrics.md
│ │ │ │ └── statistical_tests
│ │ │ │ │ └── paired.md
│ │ │ ├── img
│ │ │ │ ├── colab_examples_1.png
│ │ │ │ └── colab_examples_2.png
│ │ │ ├── content_analyzer
│ │ │ │ ├── index_interface.md
│ │ │ │ ├── content_techniques
│ │ │ │ │ ├── from_npy.md
│ │ │ │ │ ├── textual_techniques
│ │ │ │ │ │ ├── original_data.md
│ │ │ │ │ │ ├── synset_df_frequency.md
│ │ │ │ │ │ ├── tfidf.md
│ │ │ │ │ │ └── embedding_techniques
│ │ │ │ │ │ │ ├── document_embeddings.md
│ │ │ │ │ │ │ ├── sentence_embeddings.md
│ │ │ │ │ │ │ ├── contextualized_embeddings.md
│ │ │ │ │ │ │ ├── word_embeddings.md
│ │ │ │ │ │ │ └── combining_embeddings.md
│ │ │ │ │ └── visual_techniques
│ │ │ │ │ │ ├── high_level_visual.md
│ │ │ │ │ │ └── low_level_visual.md
│ │ │ │ ├── raw_sources.md
│ │ │ │ ├── information_preprocessors
│ │ │ │ │ ├── textual_preprocessors
│ │ │ │ │ │ ├── nltk.md
│ │ │ │ │ │ ├── spacy.md
│ │ │ │ │ │ └── ekphrasis.md
│ │ │ │ │ ├── visual_preprocessors
│ │ │ │ │ │ └── torch_preprocessors.md
│ │ │ │ │ └── postprocessors
│ │ │ │ │ │ └── postprocessor.md
│ │ │ │ ├── exogenous_techniques
│ │ │ │ │ ├── babelfy.md
│ │ │ │ │ ├── dbpedia.md
│ │ │ │ │ └── properties_from_dataset.md
│ │ │ │ ├── ratings
│ │ │ │ │ ├── ratings.md
│ │ │ │ │ └── score_processors.md
│ │ │ │ └── config.md
│ │ │ ├── utils
│ │ │ │ └── report.md
│ │ │ ├── recsys
│ │ │ │ ├── methodology
│ │ │ │ │ ├── all_items.md
│ │ │ │ │ ├── test_items.md
│ │ │ │ │ ├── test_ratings.md
│ │ │ │ │ ├── training_items.md
│ │ │ │ │ └── abstract_methodology.md
│ │ │ │ ├── graph_based
│ │ │ │ │ ├── graph_based_recsys.md
│ │ │ │ │ ├── graph_based_algorithms
│ │ │ │ │ │ └── nx_pagerank.md
│ │ │ │ │ ├── graphs
│ │ │ │ │ │ ├── nx_bipartite.md
│ │ │ │ │ │ ├── nx_full.md
│ │ │ │ │ │ ├── nx_tripartite.md
│ │ │ │ │ │ └── nodes.md
│ │ │ │ │ └── feature_selection.md
│ │ │ │ ├── partitioning
│ │ │ │ │ ├── kfold.md
│ │ │ │ │ ├── bootstrap.md
│ │ │ │ │ ├── hold_out.md
│ │ │ │ │ └── abstract_partitioning.md
│ │ │ │ ├── content_based
│ │ │ │ │ ├── content_based_recsys.md
│ │ │ │ │ ├── content_based_algorithms
│ │ │ │ │ │ ├── index_query.md
│ │ │ │ │ │ ├── linear_predictor.md
│ │ │ │ │ │ ├── centroid_vector.md
│ │ │ │ │ │ └── classifier_recommender.md
│ │ │ │ │ └── visual_based_algorithms
│ │ │ │ │ │ └── vbpr.md
│ │ │ │ └── experiment.md
│ │ │ ├── javascripts
│ │ │ │ └── mathjax.js
│ │ │ ├── first_steps
│ │ │ │ ├── installation.md
│ │ │ │ └── colab_examples.md
│ │ │ └── index.md
├── codecov.yml
├── datasets
│ └── ml-100k_extra_small
│ │ ├── users_extra_small.csv
│ │ └── ratings_extra_small.csv
├── .gitattributes
├── .coveragerc
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── feature_request.md
│ │ └── bug_report.md
│ └── workflows
│ │ ├── docs_building.yml
│ │ └── testing_pipeline.yml
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | /.idea/
2 |
--------------------------------------------------------------------------------
/test/recsys/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/graphs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/evaluation/metrics/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/graph_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/graphs/feature_selection/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/visual_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/memory_interfaces/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/ratings_manager/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/evaluation/eval_pipeline_modules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/visual_based_algorithm/vbpr/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/test_files/complex_contents/index/MAIN_WRITELOCK:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/clayrs/evaluation/eval_pipeline_modules/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/classifier/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/regressor/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/graph_based_algorithm/page_rank/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_loader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/centroid_vector/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/index_query/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/recsys/graphs/test_networkx_implementation/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/clayrs/recsys/visual_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | from .vbpr import *
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools", "wheel", "Cython"]
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/test_visualpostprocessors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/test_visualpreprocessors/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/content_representation/__init__.py:
--------------------------------------------------------------------------------
1 | from .content import Content
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/visual_technique/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/clayrs/recsys/visual_based_algorithm/vbpr/__init__.py:
--------------------------------------------------------------------------------
1 | from .vbpr_algorithm import VBPR
2 |
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/embedding_technique/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/test_int.json:
--------------------------------------------------------------------------------
1 | [{"Int_field": 50}]
--------------------------------------------------------------------------------
/clayrs/recsys/graph_based_algorithm/page_rank/__init__.py:
--------------------------------------------------------------------------------
1 | from .nx_page_rank import NXPageRank
2 |
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/index_query/__init__.py:
--------------------------------------------------------------------------------
1 | from .index_query import IndexQuery
2 |
--------------------------------------------------------------------------------
/clayrs/__init__.py:
--------------------------------------------------------------------------------
1 | import nest_asyncio
2 | nest_asyncio.apply() # fix IPython multiprocessing error
3 |
--------------------------------------------------------------------------------
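
Note: the root package patches the event loop at import time so that ClayRS's asyncio-based code also runs inside IPython/Jupyter, where a loop is already active. A minimal sketch of the situation being worked around (illustrative only, not part of the repo):

    import asyncio
    import nest_asyncio

    async def compute():
        return 42

    loop = asyncio.new_event_loop()
    nest_asyncio.apply(loop)  # allow re-entering a loop that is already running
    # Inside IPython/Jupyter a loop already runs; without the patch,
    # loop.run_until_complete() would raise "This event loop is already running".
    print(loop.run_until_complete(compute()))  # -> 42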
/clayrs/content_analyzer/information_processor/postprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | from .postprocessor import *
2 |
--------------------------------------------------------------------------------
/clayrs/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .load_content import load_content_instance
2 | from .report import Report
3 |
--------------------------------------------------------------------------------
/docs/mkdocs/requirements-doc.txt:
--------------------------------------------------------------------------------
1 | mkdocs-material~=9.0.15
2 | mkdocstrings-python~=0.8.3
3 | griffe~=0.25.5
--------------------------------------------------------------------------------
/clayrs/content_analyzer/memory_interfaces/__init__.py:
--------------------------------------------------------------------------------
1 | from .text_interface import KeywordIndex, SearchIndex
2 |
--------------------------------------------------------------------------------
/clayrs/recsys/graph_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | from . import page_rank
2 |
3 | from .page_rank import *
4 |
5 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/eval_model.md:
--------------------------------------------------------------------------------
1 | # Eval Model class
2 |
3 | ::: clayrs.evaluation.eval_model
4 |     handler: python
5 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/img/colab_examples_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/docs/mkdocs/docs/img/colab_examples_1.png
--------------------------------------------------------------------------------
/docs/mkdocs/docs/img/colab_examples_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/docs/mkdocs/docs/img/colab_examples_2.png
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | coverage:
2 |   status:
3 |     project:
4 |       default:
5 |         target: auto
6 |         threshold: 3%
7 |     patch: off
8 |
--------------------------------------------------------------------------------
/test/utils/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../clayrs/')))
4 |
--------------------------------------------------------------------------------
/datasets/ml-100k_extra_small/users_extra_small.csv:
--------------------------------------------------------------------------------
1 | user_id,age,gender,occupation,zip_code
2 | 1,24,M,technician,85711
3 | 20,42,F,homemaker,95660
4 |
--------------------------------------------------------------------------------
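
Note: a minimal pandas sketch (illustrative, not repo code) for loading this sample; `zip_code` is read as a string so ZIP codes with leading zeros survive the round trip:

    import pandas as pd

    users = pd.read_csv("datasets/ml-100k_extra_small/users_extra_small.csv",
                        dtype={"zip_code": str})
    print(users.head())  # columns: user_id, age, gender, occupation, zip_code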
/test/content_analyzer/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../clayrs/')))
--------------------------------------------------------------------------------
/test/test_files/complex_contents/index/_MAIN_2.toc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/index/_MAIN_2.toc
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/1.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/1.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/10.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/10.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/11.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/11.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/12.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/12.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/13.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/13.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/14.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/14.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/15.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/15.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/16.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/16.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/17.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/17.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/18.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/18.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/19.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/19.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/2.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/2.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/20.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/20.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/21.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/21.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/22.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/22.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/23.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/23.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/24.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/24.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/25.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/25.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/26.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/26.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/27.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/27.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/28.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/28.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/29.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/29.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/3.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/3.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/30.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/30.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/31.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/31.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/32.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/32.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/33.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/33.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/34.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/34.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/35.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/35.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/36.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/36.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/37.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/37.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/38.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/38.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/39.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/39.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/4.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/4.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/40.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/40.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/41.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/41.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/42.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/42.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/43.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/43.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/44.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/44.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/45.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/45.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/46.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/46.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/47.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/47.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/48.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/48.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/49.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/49.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/5.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/5.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/50.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/50.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/51.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/51.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/52.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/52.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/53.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/53.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/54.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/54.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/55.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/55.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/56.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/56.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/57.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/57.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/58.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/58.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/59.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/59.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/6.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/6.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/60.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/60.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/61.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/61.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/62.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/62.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/63.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/63.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/64.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/64.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/65.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/65.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/66.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/66.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/67.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/67.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/68.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/68.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/69.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/69.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/7.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/7.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/70.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/70.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/8.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/8.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/users_codified/9.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/users_codified/9.xz
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/ri_model.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/ri_model.model
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/centroid_vector/__init__.py:
--------------------------------------------------------------------------------
1 | from .centroid_vector import CentroidVector
2 | from .similarities import CosineSimilarity
3 |
--------------------------------------------------------------------------------
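
Note: this package exposes the centroid-vector recommender, which represents a user as the centroid of the embeddings of positively rated items and ranks unseen items by similarity to it. A minimal NumPy sketch of that idea (illustrative only; the real API is the CentroidVector and CosineSimilarity classes imported above):

    import numpy as np

    def cosine_similarity(a, b):
        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    liked = np.array([[1.0, 0.0, 1.0],      # embeddings of positively rated items
                      [0.8, 0.2, 0.9]])
    centroid = liked.mean(axis=0)           # the user profile
    candidates = {"i1": np.array([0.9, 0.1, 1.0]),
                  "i2": np.array([0.0, 1.0, 0.1])}
    ranked = sorted(candidates,
                    key=lambda i: cosine_similarity(centroid, candidates[i]),
                    reverse=True)
    print(ranked)  # -> ['i1', 'i2']: items closest to the centroid first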
/test/test_files/test_embedding_models/doc2vec_model.kv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/doc2vec_model.kv
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/fasttext_model.kv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/fasttext_model.kv
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/word2vec_model.kv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/word2vec_model.kv
--------------------------------------------------------------------------------
/clayrs/content_analyzer/information_processor/visual_preprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | from .torch_builtin_transformer import *
2 | from .torch_builtin_augmenter import *
3 |
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/__init__.py:
--------------------------------------------------------------------------------
1 | from .graph import UserNode, ItemNode, PropertyNode
2 |
3 | from .nx_implementation import *
4 | from .feature_selection import *
5 |
--------------------------------------------------------------------------------
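
Note: the graphs subpackage models ratings as edges between typed nodes (UserNode, ItemNode, PropertyNode) on top of NetworkX. A rough sketch of the underlying idea (illustrative; the actual classes are the NX* graphs imported above):

    import networkx as nx

    g = nx.DiGraph()
    g.add_edge(("user", "u1"), ("item", "i1"), weight=0.8)           # rating edge
    g.add_edge(("item", "i1"), ("property", "Comedy"), weight=1.0)   # exogenous property
    # Personalized PageRank from u1 scores items (and properties) for recommendation
    scores = nx.pagerank(g, personalization={("user", "u1"): 1.0})
    print(scores)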
/docs/mkdocs/docs/content_analyzer/index_interface.md:
--------------------------------------------------------------------------------
1 | # Index interface
2 |
3 | ::: clayrs.content_analyzer.memory_interfaces.text_interface
4 |     handler: python
5 |
--------------------------------------------------------------------------------
/test/test_files/random_tsv.tsv:
--------------------------------------------------------------------------------
1 | listen improve differ
2 | visitor meant kind
3 | basis climb honor
4 | simple vote closer
5 | blind finger pencil
6 | clock energy shape
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/lsa/lsa_model.model:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/lsa/lsa_model.model
--------------------------------------------------------------------------------
/test/content_analyzer/content_representation/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../clayrs/')))
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112281.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112281.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112302.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112302.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112346.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112346.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112453.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112453.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112641.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112641.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112760.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112760.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0112896.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0112896.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113041.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113041.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113101.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113101.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113189.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113189.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113228.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113228.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113277.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113277.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113497.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113497.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113845.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113845.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0113987.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0113987.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114319.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114319.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114388.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114388.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114576.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114576.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114709.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114709.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/movies_codified/tt0114885.xz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/movies_codified/tt0114885.xz
--------------------------------------------------------------------------------
/test/test_files/complex_contents/index/MAIN_hlk83r2qer820iyx.seg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/complex_contents/index/MAIN_hlk83r2qer820iyx.seg
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py linguist-detectable=true
2 | *.xml linguist-detectable=false
3 | *.html linguist-detectable=false
4 | *.js linguist-detectable=false
5 | *.css linguist-detectable=false
--------------------------------------------------------------------------------
/test/test_files/test_embedding_models/lsa/lsa_model.model.projection:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_embedding_models/lsa/lsa_model.model.projection
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | from . import embedding_learner
2 | from . import embedding_loader
3 |
4 | from .embedding_learner import *
5 | from .embedding_loader import *
6 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/utils/report.md:
--------------------------------------------------------------------------------
1 | # Report class
2 |
3 | ::: clayrs.utils.Report
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/clayrs/evaluation/exceptions.py:
--------------------------------------------------------------------------------
1 | class NotEnoughUsers(Exception):
2 | """
3 | Exception to raise when DeltaGap tries to split n_users into n_groups but n_users < n_groups
4 | """
5 | pass
6 |
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/feature_selection/exceptions.py:
--------------------------------------------------------------------------------
1 | class FeatureSelectionException(Exception):
2 | """
3 | Generic exception used inside the FeatureSelectionAlgorithm
4 | """
5 | pass
6 |
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/nx_implementation/__init__.py:
--------------------------------------------------------------------------------
1 | from .nx_bipartite_graphs import NXBipartiteGraph
2 | from .nx_tripartite_graphs import NXTripartiteGraph
3 | from .nx_full_graphs import NXFullGraph
4 |
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/o-neill-dress-black-and-white-164-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/o-neill-dress-black-and-white-164-1.jpg
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/wildfox-floral-print-leggings-357-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/wildfox-floral-print-leggings-357-1.jpg
--------------------------------------------------------------------------------
/clayrs/content_analyzer/ratings_manager/__init__.py:
--------------------------------------------------------------------------------
1 | from .score_processor import NumberNormalizer
2 | from .ratings import Ratings, Rank, Prediction
3 | from .sentiment_analysis import TextBlobSentimentAnalysis
4 |
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/anthropologie-skirt-light-pink-434-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/anthropologie-skirt-light-pink-434-1.jpg
--------------------------------------------------------------------------------
/datasets/ml-100k_extra_small/ratings_extra_small.csv:
--------------------------------------------------------------------------------
1 | user_id,item_id,rating,timestamp
2 | 1,61,4,878542420
3 | 1,189,3,888732928
4 | 1,33,4,878542699
5 | 20,288,1,879667584
6 | 20,208,2,879669401
7 | 20,11,2,879669401
8 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | dir_test_files = os.path.join(os.path.dirname(__file__), 'test_files')
5 | dir_root_repo = Path(os.path.join(os.path.dirname(__file__), '..')).resolve()
6 |
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/haute-hippie-top-white-and-black-1015-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/haute-hippie-top-white-and-black-1015-1.jpg
--------------------------------------------------------------------------------
/test/test_files/test_images/images_files/elizabeth-and-james-top-ecru-and-pink-81-1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/swapUniba/ClayRS/HEAD/test/test_files/test_images/images_files/elizabeth-and-james-top-ecru-and-pink-81-1.jpg
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/classifier/__init__.py:
--------------------------------------------------------------------------------
1 | from .classifier_recommender import ClassifierRecommender
2 | from .classifiers import SkSVC, SkKNN, SkRandomForest, SkLogisticRegression, SkDecisionTree, SkGaussianProcess
3 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/methodology/all_items.md:
--------------------------------------------------------------------------------
1 | # All Items methodology
2 |
3 | ::: clayrs.recsys.AllItemsMethodology
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/graph_based_recsys.md:
--------------------------------------------------------------------------------
1 | # Graph Based RecSys
2 |
3 | ::: clayrs.recsys.recsys.GraphBasedRS
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/methodology/test_items.md:
--------------------------------------------------------------------------------
1 | # Test Items methodology
2 |
3 | ::: clayrs.recsys.TestItemsMethodology
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/partitioning/kfold.md:
--------------------------------------------------------------------------------
1 | # KFold partitioning technique
2 |
3 | ::: clayrs.recsys.KFoldPartitioning
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/from_npy.md:
--------------------------------------------------------------------------------
1 | # Import from NPY
2 |
3 | ::: clayrs.content_analyzer.FromNPY
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/methodology/test_ratings.md:
--------------------------------------------------------------------------------
1 | # Test Ratings methodology
2 |
3 | ::: clayrs.recsys.TestRatingsMethodology
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/partitioning/bootstrap.md:
--------------------------------------------------------------------------------
1 | # Bootstrap partitioning technique
2 |
3 | ::: clayrs.recsys.BootstrapPartitioning
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/partitioning/hold_out.md:
--------------------------------------------------------------------------------
1 | # HoldOut partitioning technique
2 |
3 | ::: clayrs.recsys.HoldOutPartitioning
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [report]
2 | exclude_lines =
3 | pragma: no cover
4 | def __repr__
5 | def __str__
6 | def __init__
7 | raise NotImplementedError
8 | pass
9 | if __name__ == .__main__.:
10 | from
11 | import
12 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/content_based/content_based_recsys.md:
--------------------------------------------------------------------------------
1 | # Content Based RecSys
2 |
3 | ::: clayrs.recsys.recsys.ContentBasedRS
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/methodology/training_items.md:
--------------------------------------------------------------------------------
1 | # Training Items methodology
2 |
3 | ::: clayrs.recsys.TrainingItemsMethodology
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_loader/__init__.py:
--------------------------------------------------------------------------------
1 | from .gensim import Gensim
2 | from .transformer import Transformers, BertTransformers, T5Transformers
3 | from .sbert import Sbert
4 | from .vector_strategy import SumStrategy, CatStrategy
5 |
--------------------------------------------------------------------------------
/clayrs/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from . import eval_pipeline_modules
2 | from . import metrics
3 |
4 | from .eval_pipeline_modules import *
5 | from .metrics import *
6 | from .eval_model import EvalModel
7 | from .statistical_test import Ttest, Wilcoxon
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/methodology/abstract_methodology.md:
--------------------------------------------------------------------------------
1 | # Abstract methodology class
2 |
3 | ::: clayrs.recsys.methodology.Methodology
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/feature_selection/__init__.py:
--------------------------------------------------------------------------------
1 | from .feature_selection_alg import TopKPageRank, TopKDegreeCentrality, TopKEigenVectorCentrality
2 | from .feature_selection_fn import feature_selector
3 | from .exceptions import FeatureSelectionException
4 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/partitioning/abstract_partitioning.md:
--------------------------------------------------------------------------------
1 | # Abstract Partitioning class
2 |
3 | ::: clayrs.recsys.partitioning.Partitioning
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/information_processor/__init__.py:
--------------------------------------------------------------------------------
1 | from .nltk_processor import NLTK
2 | from .spacy_processor import Spacy
3 | from .ekphrasis_processor import Ekphrasis
4 |
5 |
6 | from .visual_preprocessors import *
7 | from .postprocessors import *
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/raw_sources.md:
--------------------------------------------------------------------------------
1 | # Raw Source Wrappers
2 |
3 | ::: clayrs.content_analyzer.raw_information_source
4 | handler: python
5 | options:
6 | filters:
7 | - "!^_[^_]"
8 | - "!^RawInformationSource$"
9 |
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/regressor/__init__.py:
--------------------------------------------------------------------------------
1 | from .linear_predictor import LinearPredictor
2 | from .regressors import SkLinearRegression, SkRidge, SkBayesianRidge, SkSGDRegressor, SkARDRegression,\
3 | SkHuberRegressor, SkPassiveAggressiveRegressor
4 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/information_preprocessors/textual_preprocessors/nltk.md:
--------------------------------------------------------------------------------
1 | # NLTK Preprocessor
2 |
3 | ::: clayrs.content_analyzer.NLTK
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/information_preprocessors/textual_preprocessors/spacy.md:
--------------------------------------------------------------------------------
1 | # Spacy preprocessor
2 |
3 | ::: clayrs.content_analyzer.Spacy
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/exogenous_techniques/babelfy.md:
--------------------------------------------------------------------------------
1 | # BabelPy Entity Linking
2 |
3 | ::: clayrs.content_analyzer.BabelPyEntityLinking
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/content_based/content_based_algorithms/index_query.md:
--------------------------------------------------------------------------------
1 | # Index Query
2 |
3 | ::: clayrs.recsys.content_based_algorithm.index_query.index_query.IndexQuery
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/exogenous_techniques/dbpedia.md:
--------------------------------------------------------------------------------
1 | # Properties from DBPedia ontology
2 |
3 | ::: clayrs.content_analyzer.DBPediaMappingTechnique
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/content_based/visual_based_algorithms/vbpr.md:
--------------------------------------------------------------------------------
1 | # VBPR
2 |
3 | ::: clayrs.recsys.visual_based_algorithm.vbpr.vbpr_algorithm.VBPR
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/information_preprocessors/textual_preprocessors/ekphrasis.md:
--------------------------------------------------------------------------------
1 | # Ekphrasis Preprocessor
2 |
3 | ::: clayrs.content_analyzer.Ekphrasis
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/graph_based_algorithms/nx_pagerank.md:
--------------------------------------------------------------------------------
1 | # Page Rank
2 |
3 |
4 | ::: clayrs.recsys.graph_based_algorithm.page_rank.nx_page_rank.NXPageRank
5 | handler: python
6 | options:
7 | show_root_toc_entry: true
8 | show_root_heading: true
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/graphs/nx_bipartite.md:
--------------------------------------------------------------------------------
1 | # Bipartite Graph
2 |
3 | ::: clayrs.recsys.graphs.nx_implementation.nx_bipartite_graphs.NXBipartiteGraph
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/exogenous_techniques/properties_from_dataset.md:
--------------------------------------------------------------------------------
1 | # Properties from local dataset
2 |
3 | ::: clayrs.content_analyzer.PropertiesFromDataset
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/test/test_files/d2v_test_data.json:
--------------------------------------------------------------------------------
1 | [{"id_field":"01","doc_field":"I love machine learning. It's awesome."},
2 | {"id_field":"02","doc_field":"I love coding in python"},
3 | {"id_field":"03","doc_field":"I love building chatbots"},
4 | {"id_field":"04","doc_field":"they chat amazingly well"}]
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/original_data.md:
--------------------------------------------------------------------------------
1 | # Original Data
2 |
3 | ::: clayrs.content_analyzer.OriginalData
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 | members: none
9 |
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/__init__.py:
--------------------------------------------------------------------------------
1 | from . import centroid_vector
2 | from . import classifier
3 | from . import index_query
4 | from . import regressor
5 |
6 | from .centroid_vector import *
7 | from .classifier import *
8 | from .index_query import *
9 | from .regressor import *
10 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/synset_df_frequency.md:
--------------------------------------------------------------------------------
1 | # Synset Document Frequency
2 |
3 | ::: clayrs.content_analyzer.PyWSDSynsetDocumentFrequency
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/metrics/plot_metrics.md:
--------------------------------------------------------------------------------
1 | # Plot metrics
2 |
3 | Plot metrics save a plot in the chosen output directory
4 |
5 | ::: clayrs.evaluation.metrics.plot_metrics
6 | handler: python
7 | options:
8 | filters:
9 | - "!^_[^_]"
10 | - "!^PlotMetric$"
11 | - "!.*def.*"
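12 |
13 | For instance, a minimal sketch of a plot metric (the `out_dir` parameter is an
14 | assumption; check the class signatures above):
15 |
16 | ```python
17 | from clayrs import evaluation as eva
18 |
19 | # saves the long-tail distribution plot in the 'plots/' directory,
20 | # to be passed in the metric_list of an EvalModel
21 | metric = eva.LongTailDistr(out_dir='plots/')
22 | ```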
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/__init__.py:
--------------------------------------------------------------------------------
1 | from .doc2vec import GensimDoc2Vec
2 | from .fasttext import GensimFastText
3 | from .latent_semantic_analysis import GensimLatentSemanticAnalysis
4 | from .random_indexing import GensimRandomIndexing
5 | from .word2vec import GensimWord2Vec
6 | from .lda import GensimLDA
7 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/field_content_production_techniques/visual_techniques/__init__.py:
--------------------------------------------------------------------------------
1 | from .low_level_techniques import SkImageHogDescriptor, SkImageCannyEdgeDetector, SkImageSIFT, ColorsHist, \
2 | SkImageLBP, ColorQuantization, CustomFilterConvolution
3 | from .high_level_techniques import PytorchImageModels, CaffeImageModels
4 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/metrics/error_metrics.md:
--------------------------------------------------------------------------------
1 | # Error metrics
2 |
3 | Error metrics evaluate 'how wrong' the recommender system was in predicting a rating
4 |
5 | ::: clayrs.evaluation.metrics.error_metrics
6 | handler: python
7 | options:
8 | filters:
9 | - "!^_[^_]"
10 | - "!^ErrorMetric$"
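11 |
12 | A minimal usage sketch, assuming the usual `EvalModel(pred_list, truth_list, metric_list)`
13 | evaluation workflow:
14 |
15 | ```python
16 | from clayrs import evaluation as eva
17 |
18 | def evaluate(pred_list, truth_list):
19 |     # pred_list / truth_list hold predicted and ground-truth ratings, one per fold
20 |     em = eva.EvalModel(pred_list, truth_list,
21 |                        metric_list=[eva.MAE(), eva.MSE(), eva.RMSE()])
22 |     return em.fit()  # system-wide and per-user results
23 | ```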
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/visual_techniques/high_level_visual.md:
--------------------------------------------------------------------------------
1 | # High level techniques
2 |
3 | ::: clayrs.content_analyzer.field_content_production_techniques.visual_techniques.high_level_techniques
4 | handler: python
5 | options:
6 | heading_level: 3
7 | filters:
8 | - '!^HighLevelVisual$'
9 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/visual_techniques/low_level_visual.md:
--------------------------------------------------------------------------------
1 | # Low level techniques
2 |
3 | ::: clayrs.content_analyzer.field_content_production_techniques.visual_techniques.low_level_techniques
4 | handler: python
5 | options:
6 | heading_level: 3
7 | filters:
8 | - '!^LowLevelVisual$'
9 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/metrics/ranking_metrics.md:
--------------------------------------------------------------------------------
1 | # Ranking metrics
2 |
3 | Ranking metrics evaluate the quality of the recommendation lists
4 |
5 | ::: clayrs.evaluation.metrics.ranking_metrics
6 | handler: python
7 | options:
8 | filters:
9 | - "!^_[^_]"
10 | - "!^RankingMetric$"
11 | - "!.*def.*"
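12 |
13 | For instance, a sketch combining the full-list NDCG with its cut-off variant
14 | (the `k` parameter name is an assumption; check the class signatures above):
15 |
16 | ```python
17 | from clayrs import evaluation as eva
18 |
19 | # to be passed in the metric_list of an EvalModel
20 | metrics = [eva.NDCG(), eva.NDCGAtK(k=5), eva.MRR()]
21 | ```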
--------------------------------------------------------------------------------
/clayrs/content_analyzer/field_content_production_techniques/embedding_technique/__init__.py:
--------------------------------------------------------------------------------
1 | from .combining_technique import Centroid, Sum, SingleToken
2 | from .embedding_technique import WordEmbeddingTechnique, SentenceEmbeddingTechnique, DocumentEmbeddingTechnique, \
3 | Word2SentenceEmbedding, Sentence2DocEmbedding, Word2DocEmbedding, Sentence2WordEmbedding
4 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/ratings/ratings.md:
--------------------------------------------------------------------------------
1 | # Ratings class
2 |
3 | The `Ratings` class is the main class responsible for importing a dataset containing interactions between users and items
4 |
5 | ::: clayrs.content_analyzer.ratings_manager.Ratings
6 | handler: python
7 | options:
8 | show_root_toc_entry: true
9 | show_root_heading: true
10 |
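11 | A minimal sketch of importing a rating frame from a CSV file (file name and
12 | column names are just examples; the column parameters are assumptions, check
13 | the class signature above):
14 |
15 | ```python
16 | from clayrs import content_analyzer as ca
17 |
18 | ratings = ca.Ratings(
19 |     source=ca.CSVFile('ratings.csv'),
20 |     user_id_column='user_id',
21 |     item_id_column='item_id',
22 |     score_column='rating'
23 | )
24 | ```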
--------------------------------------------------------------------------------
/clayrs/content_analyzer/field_content_production_techniques/__init__.py:
--------------------------------------------------------------------------------
1 | from . import embedding_technique
2 |
3 | from .embedding_technique import *
4 | from .visual_techniques import *
5 | from .tf_idf import WhooshTfIdf, SkLearnTfIdf
6 | from .field_content_production_technique import OriginalData, FromNPY
7 | from .synset_document_frequency import PyWSDSynsetDocumentFrequency
8 |
--------------------------------------------------------------------------------
/clayrs/utils/const.py:
--------------------------------------------------------------------------------
1 | import os
2 | from clayrs.utils.custom_logger import get_custom_logger
3 |
4 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
5 | root_path = os.path.join(THIS_DIR, '../../')
6 | contents_path = os.path.join(root_path, 'contents/')
7 | datasets_path = os.path.join(root_path, 'datasets/')
8 |
9 | logger = get_custom_logger('custom_logger')
10 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/config.md:
--------------------------------------------------------------------------------
1 | # Content Analyzer Config
2 |
3 | ::: clayrs.content_analyzer.config
4 | handler: python
5 |
6 |
7 | ## Content Analyzer Class
8 |
9 | ::: clayrs.content_analyzer.ContentAnalyzer
10 | handler: python
11 | options:
12 | heading_level: 3
13 | show_root_toc_entry: true
14 | show_root_heading: true
15 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/experiment.md:
--------------------------------------------------------------------------------
1 | # Experiment class
2 |
3 | ::: clayrs.recsys.ContentBasedExperiment
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ::: clayrs.recsys.GraphBasedExperiment
10 | handler: python
11 | options:
12 | show_root_toc_entry: true
13 | show_root_heading: true
14 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/metrics/fairness_metrics.md:
--------------------------------------------------------------------------------
1 | # Fairness metrics
2 |
3 | Fairness metrics evaluate how unbiased the recommendation lists are (e.g. whether they are biased towards popular items)
4 |
5 | ::: clayrs.evaluation.metrics.fairness_metrics
6 | handler: python
7 | options:
8 | filters:
9 | - "!^_[^_]"
10 | - "!^FairnessMetric$"
11 | - "!.*def.*"
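12 |
13 | For instance, a sketch using the Gini index (to be passed in the `metric_list` of an `EvalModel`):
14 |
15 | ```python
16 | from clayrs import evaluation as eva
17 |
18 | # 0 means recommendations are equally distributed over items,
19 | # 1 means they are concentrated on very few items
20 | metrics = [eva.GiniIndex()]
21 | ```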
--------------------------------------------------------------------------------
/docs/mkdocs/docs/javascripts/mathjax.js:
--------------------------------------------------------------------------------
1 | window.MathJax = {
2 | tex: {
3 | inlineMath: [["\\(", "\\)"]],
4 | displayMath: [["\\[", "\\]"]],
5 | processEscapes: true,
6 | processEnvironments: true
7 | },
8 | options: {
9 | ignoreHtmlClass: ".*|",
10 | processHtmlClass: "arithmatex"
11 | }
12 | };
13 |
14 | document$.subscribe(() => {
15 | MathJax.typesetPromise()
16 | })
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/tfidf.md:
--------------------------------------------------------------------------------
1 | # TfIdf
2 |
3 | ::: clayrs.content_analyzer.SkLearnTfIdf
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ::: clayrs.content_analyzer.WhooshTfIdf
10 | handler: python
11 | options:
12 | show_root_toc_entry: true
13 | show_root_heading: true
--------------------------------------------------------------------------------
/test/test_files/users_info.json:
--------------------------------------------------------------------------------
1 | [{"user_id": "1", "name": "Roberto", "birth_date": "10-09-1998"},
2 | {"user_id": "2", "name": "Mattia", "birth_date": "11-10-1996"},
3 | {"user_id": "3", "name": "Francesco", "birth_date": "01-03-1995"},
4 | {"user_id": "4", "name": "Carlo", "birth_date": "07-09-1992"},
5 | {"user_id": "5", "name": "Pasquale", "birth_date": "13-11-1998"},
6 | {"user_id": "6", "name": "Sergio", "birth_date": "06-05-1998"}]
--------------------------------------------------------------------------------
/test/test_files/test_dbpedia/movies_info_reduced.json:
--------------------------------------------------------------------------------
1 | [
2 | {"Title": "Jumanji", "Year": "1995", "Rated": "PG", "Released": "15 Dec 1995", "Budget_source": "6.5E7", "cinematography": "", "only_local": "", "wiki_id": "3700174", "runtime (m)": "104.0"},
3 | {"Title": "Inception", "Budget_source": "1.6E8", "only_local": ""},
4 | {"Title": "Demon Island"},
5 | {"Title": "Léon: The Professional"},
6 | {"Title": "not_exiiiiissstss"}
7 | ]
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/graphs/nx_full.md:
--------------------------------------------------------------------------------
1 | # Full Graph
2 |
3 | Please remember that this class is a subclass of [NXTripartiteGraph][clayrs.recsys.NXTripartiteGraph],
4 | so it inherits all of its methods. You can check its documentation as well!
5 |
6 | ::: clayrs.recsys.graphs.nx_implementation.nx_full_graphs.NXFullGraph
7 | handler: python
8 | options:
9 | show_root_toc_entry: true
10 | show_root_heading: true
--------------------------------------------------------------------------------
/test/test_files/test_ratings/ratings_1591277020.csv:
--------------------------------------------------------------------------------
1 | 01,a,0.2333333333333333,1234567,not so good,I expected more from this product,2.0
2 | 01,b,0.8333333333333334,1234567,perfect,I love this product,5.0
3 | 01,c,0.8666666666666667,1234567,awesome,The perfect gift for my darling,4.0
4 | 02,a,-0.3666666666666667,1234567,a disaster,Too much expensive ,1.0
5 | 02,c,0.6,1234567,really good,A good compromise,3.5
6 | 03,b,0.6666666666666666,1234567,Awesome,,5.0
7 |
--------------------------------------------------------------------------------
/clayrs/evaluation/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from .classification_metrics import Precision, PrecisionAtK, RPrecision, Recall, RecallAtK, \
2 | FMeasure, FMeasureAtK
3 | from .error_metrics import MAE, MSE, RMSE
4 | from .fairness_metrics import GiniIndex, DeltaGap, PredictionCoverage, CatalogCoverage
5 | from .plot_metrics import PopRatioProfileVsRecs, PopRecsCorrelation, LongTailDistr
6 | from .ranking_metrics import NDCG, NDCGAtK, MRR, MRRAtK, Correlation, MAP
7 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/ratings/score_processors.md:
--------------------------------------------------------------------------------
1 | # Score Processors
2 |
3 | ::: clayrs.content_analyzer.ratings_manager.NumberNormalizer
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ::: clayrs.content_analyzer.ratings_manager.TextBlobSentimentAnalysis
10 | handler: python
11 | options:
12 | show_root_toc_entry: true
13 | show_root_heading: true
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/graphs/nx_tripartite.md:
--------------------------------------------------------------------------------
1 | # Tripartite Graph
2 |
3 | Please remember that this class is a subclass of [NXBipartiteGraph][clayrs.recsys.NXBipartiteGraph],
4 | so it inherits all of its methods. You can check its documentation as well!
5 |
6 | ::: clayrs.recsys.graphs.nx_implementation.nx_tripartite_graphs.NXTripartiteGraph
7 | handler: python
8 | options:
9 | show_root_toc_entry: true
10 | show_root_heading: true
11 |
--------------------------------------------------------------------------------
/clayrs/recsys/graphs/graph_metrics.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 |
4 | class GraphMetrics(ABC):
5 | """
6 | Interface for graph metrics
7 | """
8 |
9 | @abstractmethod
10 | def degree_centrality(self):
11 | raise NotImplementedError
12 |
13 | @abstractmethod
14 | def closeness_centrality(self):
15 | raise NotImplementedError
16 |
17 | @abstractmethod
18 | def dispersion(self):
19 | raise NotImplementedError
20 |
--------------------------------------------------------------------------------
/test/utils/test_load_content.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest import TestCase
3 | from clayrs.utils.load_content import load_content_instance
4 | from test import dir_test_files
5 |
6 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/')
7 |
8 |
9 | class Test(TestCase):
10 | def test_load_content_instance(self):
11 | self.assertIsNone(load_content_instance("not_existent", "invalid_item"))
12 | self.assertIsNotNone(load_content_instance(movies_dir, "tt0112281"))
13 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/metrics/classification_metrics.md:
--------------------------------------------------------------------------------
1 | # Classification metrics
2 |
3 | A classification metric uses confusion matrix terminology (true positive, false positive, true negative, false negative)
4 | to classify each predicted item; in general it needs a way to discern relevant items from non-relevant ones for each
5 | user
6 |
7 | ::: clayrs.evaluation.metrics.classification_metrics
8 | handler: python
9 | options:
10 | filters:
11 | - "!^_[^_]"
12 | - "!^ClassificationMetric$"
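13 |
14 | For instance, a sketch that considers relevant the items rated 3 or more by a user
15 | (the `relevant_threshold` parameter name is an assumption; check the class signatures above):
16 |
17 | ```python
18 | from clayrs import evaluation as eva
19 |
20 | # to be passed in the metric_list of an EvalModel
21 | metrics = [eva.Precision(relevant_threshold=3), eva.Recall(relevant_threshold=3)]
22 | ```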
--------------------------------------------------------------------------------
/docs/mkdocs/docs/evaluation/statistical_tests/paired.md:
--------------------------------------------------------------------------------
1 | # Paired statistical tests
2 |
3 | ::: clayrs.evaluation.statistical_test.PairedTest
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ::: clayrs.evaluation.statistical_test.Ttest
10 | handler: python
11 | options:
12 | show_root_toc_entry: true
13 | show_root_heading: true
14 |
15 | ::: clayrs.evaluation.statistical_test.Wilcoxon
16 | handler: python
17 | options:
18 | show_root_toc_entry: true
19 | show_root_heading: true
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/information_preprocessors/visual_preprocessors/torch_preprocessors.md:
--------------------------------------------------------------------------------
1 | # Torch Preprocessors
2 |
3 | ## Torch transformers
4 |
5 | ::: clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_transformer
6 | handler: python
7 | options:
8 | heading_level: 3
9 | filters:
10 | - '!^TorchBuiltInTransformer$'
11 |
12 | ---
13 |
14 | ## Torch augmenters
15 |
16 | ::: clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_augmenter
17 | handler: python
18 | options:
19 | heading_level: 3
20 |
--------------------------------------------------------------------------------
/clayrs/recsys/__init__.py:
--------------------------------------------------------------------------------
1 | from . import content_based_algorithm
2 | from . import graph_based_algorithm
3 | from . import graphs
4 |
5 | from .content_based_algorithm import *
6 | from .graph_based_algorithm import *
7 | from .graphs import *
8 | from .visual_based_algorithm import *
9 | from .recsys import ContentBasedRS, GraphBasedRS
10 | from .partitioning import KFoldPartitioning, HoldOutPartitioning, BootstrapPartitioning
11 | from .methodology import TestRatingsMethodology, TestItemsMethodology, TrainingItemsMethodology, AllItemsMethodology
12 | from .experiment import ContentBasedExperiment, GraphBasedExperiment
13 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/content_based/content_based_algorithms/linear_predictor.md:
--------------------------------------------------------------------------------
1 | # Linear Predictor
2 |
3 | ::: clayrs.recsys.content_based_algorithm.regressor.linear_predictor.LinearPredictor
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ---
10 |
11 | ## Regressors Implemented
12 |
13 | The following are the regressors you can use in the `regressor` parameter of the `LinearPredictor` class
14 |
15 |
16 | ::: clayrs.recsys.content_based_algorithm.regressor.regressors
17 | handler: python
18 | options:
19 | heading_level: 3
20 | filters:
21 | - '!^Regressor$'
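22 |
23 | For instance, a minimal sketch ('tfidf' is a hypothetical representation id
24 | produced by the Content Analyzer):
25 |
26 | ```python
27 | from clayrs import recsys as rs
28 |
29 | alg = rs.LinearPredictor(
30 |     item_field={'Plot': 'tfidf'},
31 |     regressor=rs.SkLinearRegression()
32 | )
33 | ```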
--------------------------------------------------------------------------------
/test/test_files/test_images/tradesy_small_local_relative_paths.json:
--------------------------------------------------------------------------------
1 | [
2 | {"imagePath": "test/test_files/test_images/images_files/haute-hippie-top-white-and-black-1015-1.jpg", "itemID": "1015"},
3 | {"imagePath": "test/test_files/test_images/images_files/anthropologie-skirt-light-pink-434-1.jpg", "itemID": "434"},
4 | {"imagePath": "test/test_files/test_images/images_files/wildfox-floral-print-leggings-357-1.jpg", "itemID": "357"},
5 | {"imagePath": "test/test_files/test_images/images_files/o-neill-dress-black-and-white-164-1.jpg", "itemID": "164"},
6 | {"imagePath": "test/test_files/test_images/images_files/elizabeth-and-james-top-ecru-and-pink-81-1.jpg", "itemID": "81"}
7 | ]
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/content_based/content_based_algorithms/centroid_vector.md:
--------------------------------------------------------------------------------
1 | # Centroid Vector
2 |
3 | ::: clayrs.recsys.content_based_algorithm.centroid_vector.CentroidVector
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ---
10 |
11 | ## Similarities implemented
12 |
13 | The following are similarities you can use in the `similarity` parameter of the `CentroidVector` class
14 |
15 | ::: clayrs.recsys.content_based_algorithm.centroid_vector.similarities.CosineSimilarity
16 | handler: python
17 | options:
18 | heading_level: 3
19 | show_root_toc_entry: true
20 | show_root_heading: true
21 |
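22 | For instance, a minimal sketch ('tfidf' is a hypothetical representation id
23 | produced by the Content Analyzer):
24 |
25 | ```python
26 | from clayrs import recsys as rs
27 |
28 | alg = rs.CentroidVector(
29 |     item_field={'Plot': 'tfidf'},
30 |     similarity=rs.CosineSimilarity()
31 | )
32 | ```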
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/content_based/content_based_algorithms/classifier_recommender.md:
--------------------------------------------------------------------------------
1 | # Classifier Recommender
2 |
3 | ::: clayrs.recsys.content_based_algorithm.classifier.classifier_recommender.ClassifierRecommender
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ---
10 |
11 | ## Classifiers Implemented
12 |
13 | The following are the classifiers you can use in the `classifier` parameter of the `ClassifierRecommender` class
14 |
15 |
16 | ::: clayrs.recsys.content_based_algorithm.classifier.classifiers
17 | handler: python
18 | options:
19 | heading_level: 3
20 | filters:
21 | - '!^Classifier$'
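22 |
23 | For instance, a minimal sketch ('tfidf' is a hypothetical representation id
24 | produced by the Content Analyzer):
25 |
26 | ```python
27 | from clayrs import recsys as rs
28 |
29 | alg = rs.ClassifierRecommender(
30 |     item_field={'Plot': 'tfidf'},
31 |     classifier=rs.SkSVC()
32 | )
33 | ```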
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/test/content_analyzer/utils/test_check_tokenization.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from clayrs.content_analyzer.utils.check_tokenization import check_not_tokenized, check_tokenized, tokenize_in_sentences
3 |
4 |
5 | class Test(TestCase):
6 | def test_check_tokenized(self):
7 | str_ = 'abcd efg'
8 | list_ = ['abcd', 'efg']
9 | check_tokenized(str_)
10 | check_tokenized(list_)
11 | check_not_tokenized(str_)
12 | check_not_tokenized(list_)
13 |
14 | def test_tokenize_sentence(self):
15 |
16 | phrases = "Ciao, questa è una prova. Anche questa. And this is the third"
17 | result = tokenize_in_sentences(phrases)
18 |
19 | self.assertTrue(len(result) == 3)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas~=1.5.3
2 | numpy~=1.22.4
3 | gensim~=4.3.0
4 | nltk~=3.5
5 | babelpy~=1.0.1
6 | mysql~=0.0.3
7 | mysql-connector-python~=8.0.20
8 | scikit-learn==1.0.2
9 | SPARQLWrapper~=1.8.5
10 | textblob~=0.15.3
11 | matplotlib~=3.5.0
12 | pywsd~=1.2.4
13 | wn~=0.0.23
14 | networkx~=2.6.3
15 | whoosh~=2.7.4
16 | sentence-transformers~=1.2.0
17 | colorama~=0.4.4
18 | tqdm~=4.62.2
19 | spacy~=3.2.1
20 | ekphrasis~=0.5.4
21 | scipy~=1.7.3
22 | torch~=1.13.0
23 | transformers~=4.29.2
24 | pyaml~=21.10.1
25 | PyYAML~=6.0.1
26 | distex~=0.7.1
27 | nest-asyncio~=1.5.5
28 | validators~=0.20.0
29 | requests~=2.28.2
30 | timm~=0.6.12
31 | scikit-image~=0.19.3
32 | torchvision~=0.14.1
33 | numpy-indexed~=0.3.5
34 | Pillow~=9.4.0
35 | opencv-python~=4.7.0.72
36 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/test/test_files/test_import_ratings.json:
--------------------------------------------------------------------------------
1 | [{"user_id": "01", "item_id": "a", "review_title": "not so good","text": "I expected more from this product", "stars": 2.0, "timestamp": 1234567},
2 | {"user_id": "01", "item_id": "b", "review_title": "perfect","text": "I love this product", "stars": 5.0, "timestamp": 1234567},
3 | {"user_id": "02", "item_id": "a", "review_title": "a disaster","text": "Too much expensive ", "stars": 1.0, "timestamp": 1234567},
4 | {"user_id": "02", "item_id": "c", "review_title": "really good","text": "A good compromise", "stars": 3.5, "timestamp": 1234567},
5 | {"user_id": "03", "item_id": "b", "review_title": "Awesome","text": "", "stars": 5.0, "timestamp": 1234567},
6 | {"user_id": "01", "item_id": "c", "review_title": "awesome","text": "The perfect gift for my darling", "stars": 4.0, "timestamp": 1234567}
7 | ]
8 |
--------------------------------------------------------------------------------
/test/content_analyzer/utils/test_id_merger.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from clayrs.content_analyzer.utils.id_merger import id_values_merger
4 |
5 |
6 | class Test(TestCase):
7 | def test_id_merger(self):
8 | self.assertEqual(id_values_merger('aaa'), 'aaa', "Must return a string value")
9 | self.assertEqual(id_values_merger(['aaa', 'bbb']), 'aaa_bbb', "Must return a string value like this aaa_bbb")
10 | self.assertEqual(id_values_merger(123), '123', "Must return a string value")
11 | self.assertEqual(id_values_merger([123, 124]), '123_124', "Must return a string value like this 123_124")
12 | self.assertEqual(id_values_merger([123, "aaa"]), '123_aaa', "Must return a string value like 123_aaa")
13 | with self.assertRaises(TypeError):
14 | id_values_merger({1: 1, 2: 2})
15 |
--------------------------------------------------------------------------------
/.github/workflows/docs_building.yml:
--------------------------------------------------------------------------------
1 | name: Build documentation
2 | on:
3 | push:
4 | branches:
5 | - master
6 | workflow_dispatch:
7 |
8 | jobs:
9 | main:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | -
14 | name: Checkout Repository
15 | uses: actions/checkout@v2
16 | with:
17 | fetch-depth: 0
18 | -
19 | name: Setup python
20 | uses: actions/setup-python@v2
21 | with:
22 | python-version: 3.8 #install the python needed
23 | -
24 | name: Setup documentation requirements
25 | run: |
26 | pip install --upgrade pip
27 | pip install -r docs/mkdocs/requirements-doc.txt
28 | -
29 | name: Deploy documentation
30 | run: |
31 | cd docs/mkdocs
32 | mkdocs gh-deploy --clean
33 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/__init__.py:
--------------------------------------------------------------------------------
1 | from . import content_representation
2 | from . import embeddings
3 | from . import field_content_production_techniques
4 | from . import information_processor
5 | from . import memory_interfaces
6 | from . import ratings_manager
7 |
8 | from .content_representation import *
9 | from .embeddings import *
10 | from .field_content_production_techniques import *
11 | from .information_processor import *
12 | from .memory_interfaces import *
13 | from .ratings_manager import *
14 | from .config import ExogenousConfig, UserAnalyzerConfig, ItemAnalyzerConfig, FieldConfig
15 | from .content_analyzer_main import ContentAnalyzer
16 | from .exogenous_properties_retrieval import DBPediaMappingTechnique, PropertiesFromDataset, BabelPyEntityLinking
17 | from .raw_information_source import CSVFile, JSONFile, DATFile, SQLDatabase
18 |
19 |
20 |
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/test_synset_document_frequency.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import os
3 |
4 | from clayrs.content_analyzer.content_representation.content import FeaturesBagField
5 | from clayrs.content_analyzer.raw_information_source import JSONFile
6 | from clayrs.content_analyzer.field_content_production_techniques import PyWSDSynsetDocumentFrequency
7 | from test import dir_test_files
8 |
9 | file_path = os.path.join(dir_test_files, "movies_info_reduced.json")
10 |
11 |
12 | class TestSynsetDocumentFrequency(TestCase):
13 | def test_produce_content(self):
14 | technique = PyWSDSynsetDocumentFrequency()
15 |
16 | features_bag_list = technique.produce_content("Plot", [], [], JSONFile(file_path))
17 |
18 | self.assertEqual(len(features_bag_list), 20)
19 | self.assertIsInstance(features_bag_list[0], FeaturesBagField)
20 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/graphs/nodes.md:
--------------------------------------------------------------------------------
1 | # Nodes categories
2 |
3 | The following are all the categories of nodes that can be added to a graph.
4 |
5 | !!! info
6 |
7 | Please note that there exists [Bipartite Graph][clayrs.recsys.NXBipartiteGraph],
8 | [Tripartite Graph][clayrs.recsys.NXTripartiteGraph] and
9 | [Full Graph][clayrs.recsys.NXFullGraph], all with their peculiarities and restrictions.
10 |
11 | Check their documentation for more!
12 |
13 | ::: clayrs.recsys.graphs.UserNode
14 | handler: python
15 | options:
16 | show_root_toc_entry: true
17 | show_root_heading: true
18 |
19 | ::: clayrs.recsys.graphs.ItemNode
20 | handler: python
21 | options:
22 | show_root_toc_entry: true
23 | show_root_heading: true
24 |
25 | ::: clayrs.recsys.graphs.PropertyNode
26 | handler: python
27 | options:
28 | show_root_toc_entry: true
29 | show_root_heading: true
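30 |
31 | For instance, a minimal sketch of creating the three node categories (assuming
32 | they are exported at package level, as the paths above suggest):
33 |
34 | ```python
35 | from clayrs import recsys as rs
36 |
37 | # a node simply wraps the value it represents
38 | user = rs.UserNode('u1')
39 | item = rs.ItemNode('tt0112281')
40 | prop = rs.PropertyNode('Thriller')
41 | ```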
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/test_doc2vec.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import os
3 | import pathlib as pl
4 |
5 | from clayrs.content_analyzer.embeddings.embedding_learner.doc2vec import GensimDoc2Vec
6 | from clayrs.content_analyzer.information_processor.nltk_processor import NLTK
7 | from clayrs.content_analyzer.raw_information_source import JSONFile
8 | from test import dir_test_files
9 |
10 | file_path = os.path.join(dir_test_files, 'movies_info_reduced.json')
11 |
12 |
13 | class TestGensimDoc2Vec(TestCase):
14 | def test_fit(self):
15 | model_path = "./model_test_Doc2Vec"
16 | learner = GensimDoc2Vec(model_path, True)
17 | learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()])
18 | model_path += ".kv"
19 |
20 | self.assertEqual(learner.get_embedding("ace").any(), True)
21 | self.assertEqual(pl.Path(model_path).resolve().is_file(), True)
22 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/test_word2vec.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import os
3 | import pathlib as pl
4 |
5 | from clayrs.content_analyzer.embeddings.embedding_learner import GensimWord2Vec
6 | from clayrs.content_analyzer.information_processor.nltk_processor import NLTK
7 | from clayrs.content_analyzer.raw_information_source import JSONFile
8 | from test import dir_test_files
9 |
10 | file_path = os.path.join(dir_test_files, 'movies_info_reduced.json')
11 |
12 |
13 | class TestGensimWord2Vec(TestCase):
14 | def test_fit(self):
15 | model_path = "./model_test_Word2Vec"
16 | learner = GensimWord2Vec(model_path, True)
17 | learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()])
18 | model_path += ".kv"
19 |
20 | self.assertEqual(learner.get_embedding("ace").any(), True)
21 | self.assertEqual(pl.Path(model_path).resolve().is_file(), True)
22 |
23 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/test_fasttext.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import os
3 | import pathlib as pl
4 |
5 | from clayrs.content_analyzer.embeddings.embedding_learner import GensimFastText
6 | from clayrs.content_analyzer.information_processor.nltk_processor import NLTK
7 | from clayrs.content_analyzer.raw_information_source import JSONFile
8 | from test import dir_test_files
9 |
10 | file_path = os.path.join(dir_test_files, 'movies_info_reduced.json')
11 |
12 |
13 | class TestGensimFastText(TestCase):
14 | def test_fit(self):
15 | model_path = "./model_test_FastText"
16 | learner = GensimFastText(model_path, True)
17 | learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()])
18 | model_path += ".kv"
19 |
20 | self.assertEqual(learner.get_embedding("ace").any(), True)
21 | self.assertEqual(pl.Path(model_path).resolve().is_file(), True)
22 |
23 |
24 |
--------------------------------------------------------------------------------
/test/content_analyzer/ratings_manager/test_sentiment_analysis.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from clayrs.content_analyzer.ratings_manager.sentiment_analysis import TextBlobSentimentAnalysis
4 | from textblob import TextBlob
5 | import numpy as np
6 |
7 |
8 | class TestTextBlobSentimentAnalysis(TestCase):
9 | def test_fit(self):
10 | text_reviews = ['good item', 'it was awful', 'pretty good', 'extraordinary', 'too much expensive']
11 |
12 | result = [TextBlobSentimentAnalysis().fit(text) for text in text_reviews]
13 | expected = [TextBlob(field_data).sentiment.polarity for field_data in text_reviews]
14 |
15 | self.assertEqual(expected, result)
16 |
17 | result_rounded = [TextBlobSentimentAnalysis(decimal_rounding=4).fit(text) for text in text_reviews]
18 | expected_rounded = [np.round(TextBlob(field_data).sentiment.polarity, 4) for field_data in text_reviews]
19 |
20 | self.assertEqual(expected_rounded, result_rounded)
21 |
--------------------------------------------------------------------------------
/clayrs/recsys/algorithm.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from abc import ABC
3 |
4 |
5 | class Algorithm(ABC):
6 | """
7 | Abstract class for an Algorithm.
8 |
9 | Every algorithm must be able to predict, to rank, or both.
10 | In case an algorithm can only do one of the two (e.g. PageRank), simply implement both
11 | methods and raise the NotPredictionAlg or NotRankingAlg exception accordingly.
12 | """
13 | __slots__ = ()
14 |
15 | @abc.abstractmethod
16 | def predict(self, **kwargs):
17 | """
18 | Method to call when score prediction must be done.
19 |
20 | If the Algorithm can't do score prediction, implement this method and raise
21 | the NotPredictionAlg exception
22 | """
23 | raise NotImplementedError
24 |
25 | @abc.abstractmethod
26 | def rank(self, **kwargs):
27 | """
28 | Method to call when ranking must be done.
29 |
30 | If the Algorithm can't rank, implement this method and raise the NotRankingAlg exception
31 | """
32 | raise NotImplementedError
33 |
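34 | # Illustrative sketch (not part of the library): a ranking-only algorithm would
35 | # implement ``predict`` by raising the dedicated exception (assumed to live in
36 | # the recsys exceptions module):
37 | #
38 | #     class RankingOnlyAlgorithm(Algorithm):
39 | #         def predict(self, **kwargs):
40 | #             raise NotPredictionAlg("This algorithm can only rank")
41 | #
42 | #         def rank(self, **kwargs):
43 | #             ...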
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/document_embeddings.md:
--------------------------------------------------------------------------------
1 | # Document Embeddings
2 |
3 | Via the following, you can obtain embeddings of ***document*** granularity
4 |
5 | ```python
6 | from clayrs import content_analyzer as ca
7 |
8 | # obtain document embeddings by training LDA model
9 | # on corpus of contents to complexly represent
10 | ca.DocumentEmbeddingTechnique(embedding_source=ca.GensimLDA())
11 | ```
12 |
13 | ::: clayrs.content_analyzer.DocumentEmbeddingTechnique
14 | handler: python
15 | options:
16 | show_root_toc_entry: true
17 | show_root_heading: true
18 |
19 | ## Document Embedding models
20 |
21 | ::: clayrs.content_analyzer.GensimLatentSemanticAnalysis
22 | handler: python
23 | options:
24 | heading_level: 3
25 | show_root_toc_entry: true
26 | show_root_heading: true
27 |
28 | ::: clayrs.content_analyzer.GensimLDA
29 | handler: python
30 | options:
31 | heading_level: 3
32 | show_root_toc_entry: true
33 | show_root_heading: true
34 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_loader/test_sbert.py:
--------------------------------------------------------------------------------
1 | from random import random
2 | from unittest import TestCase, mock
3 | import numpy as np
4 |
5 | from clayrs.content_analyzer.embeddings import Sbert
6 |
7 | result_matrix = {
8 | 'this is a phrase': np.array([random() for _ in range(768)]),
9 | 'this is another phrase': np.array([random() for _ in range(768)])
10 | }
11 |
12 |
13 | def encode(sentence, show_progress_bar):
14 | return result_matrix[sentence]
15 |
16 |
17 | class TestSbert(TestCase):
18 |
19 | @mock.patch('clayrs.content_analyzer.embeddings.sbert.SentenceTransformer')
20 | def test_sbert(self, mocked_model):
21 | instance = mocked_model.return_value
22 | instance.get_sentence_embedding_dimension.return_value = 768
23 | instance.encode.side_effect = encode
24 |
25 | source = Sbert()
26 |
27 | vector_size = source.get_vector_size()
28 |
29 | result = source.load(["this is a phrase", "this is another phrase"])
30 |
31 | self.assertEqual(len(result), 2)
32 | self.assertEqual(len(result[0]), vector_size)
33 | self.assertEqual(len(result[1]), vector_size)
34 |
--------------------------------------------------------------------------------
/test/utils/test_class_utils.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from clayrs.content_analyzer.config import ContentAnalyzerConfig, FieldConfig, ItemAnalyzerConfig, \
4 | UserAnalyzerConfig
5 | from clayrs.utils.class_utils import get_all_implemented_classes, get_all_implemented_subclasses
6 |
7 |
8 | class TestClassUtils(TestCase):
9 |
10 | def test_get_all_implemented_classes(self):
11 |
12 | results = get_all_implemented_classes(ContentAnalyzerConfig)
13 |
14 | expected_results = {ItemAnalyzerConfig, UserAnalyzerConfig}
15 | self.assertEqual(results, expected_results)
16 |
17 | results = get_all_implemented_classes(FieldConfig)
18 |
19 | expected_results = {FieldConfig}
20 | self.assertEqual(results, expected_results)
21 |
22 | def test_get_all_implemented_subclasses(self):
23 |
24 | results = get_all_implemented_subclasses(ContentAnalyzerConfig)
25 |
26 | expected_results = {ItemAnalyzerConfig, UserAnalyzerConfig}
27 | self.assertEqual(results, expected_results)
28 |
29 | results = get_all_implemented_subclasses(FieldConfig)
30 | expected_results = set()
31 |
32 | self.assertEqual(results, expected_results)
--------------------------------------------------------------------------------
/docs/mkdocs/docs/first_steps/installation.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Via PIP (recommended) { data-toc-label="Via PIP" }
4 |
5 | *ClayRS* requires Python **3.8** or later, while package dependencies are listed in `requirements.txt` and are all installable
6 | via `pip`, as is *ClayRS* itself.
7 |
8 | To install it, execute the following command:
9 |
10 | === "Latest"
11 |
12 | ``` sh
13 | pip install clayrs
14 | ```
15 |
16 | This will automatically install compatible versions of all dependencies.
17 |
18 | ---
19 | **Tip**: We suggest installing ClayRS (or any Python package, for that matter) in a virtual environment
20 |
21 | !!! quote ""
22 | *Virtual environments are special isolated environments where all the packages and versions you install only
23 | apply to that specific environment. It’s like a private island! — but for code.*
24 |
25 | Read this [Medium article][medium] to understand all the advantages, and the [official Python guide][venv]
26 | on how to set one up
27 |
28 | [medium]: https://towardsdatascience.com/why-you-should-use-a-virtual-environment-for-every-python-project-c17dab3b0fd0
29 | [venv]: https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/
--------------------------------------------------------------------------------
/test/test_files/test_images/tradesy_small_online.json:
--------------------------------------------------------------------------------
1 | [
2 | {"productUrl": "https://www.tradesy.com/tops/haute-hippie-top-white-and-black-1015/?tref=category", "imageUrl": "https://item1.tradesy.com/images/item/2/tops/haute-hippie/8-m/haute-hippie-top-white-and-black-1015-1.jpg", "itemID": "1015"},
3 | {"productUrl": "https://www.tradesy.com/bottoms/anthropologie-skirt-light-pink-434/?tref=category", "imageUrl": "https://item5.tradesy.com/images/item/2/bottoms/anthropologie/0-xs-25/anthropologie-skirt-light-pink-434-1.jpg", "itemID": "434"},
4 | {"productUrl": "https://www.tradesy.com/bottoms/wildfox-floral-print-leggings-357/?tref=category", "imageUrl": "https://item3.tradesy.com/images/item/2/bottoms/wildfox/0-xs-25/wildfox-floral-print-leggings-357-1.jpg", "itemID": "357"},
5 | {"productUrl": "https://www.tradesy.com/dresses/o-neill-dress-black-and-white-164/?tref=category", "imageUrl": "https://item5.tradesy.com/images/item/2/dresses/o-neill/8-m/o-neill-dress-black-and-white-164-1.jpg", "itemID": "164"},
6 | {"productUrl": "https://www.tradesy.com/tops/elizabeth-and-james-top-ecru-and-pink-81/?tref=category", "imageUrl": "https://item2.tradesy.com/images/item/2/tops/elizabeth-and-james/0-xs/elizabeth-and-james-top-ecru-and-pink-81-1.jpg", "itemID": "81"}
7 | ]
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/test_tf_idf.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import os
3 |
4 | from clayrs.content_analyzer.content_representation.content import FeaturesBagField
5 | from clayrs.content_analyzer.field_content_production_techniques.tf_idf import WhooshTfIdf, SkLearnTfIdf
6 | from clayrs.content_analyzer.raw_information_source import JSONFile
7 | from test import dir_test_files
8 |
9 | THIS_DIR = os.path.dirname(os.path.abspath(__file__))
10 | file_path = os.path.join(dir_test_files, "movies_info_reduced.json")
11 |
12 |
13 | class TestWhooshTfIdf(TestCase):
14 | def test_produce_content(self):
15 | technique = WhooshTfIdf()
16 |
17 | features_bag_list = technique.produce_content("Plot", [], [], JSONFile(file_path))
18 |
19 | self.assertEqual(len(features_bag_list), 20)
20 | self.assertIsInstance(features_bag_list[0], FeaturesBagField)
21 |
22 |
23 | class TestSkLearnTfIdf(TestCase):
24 |
25 | def test_produce_content(self):
26 | technique = SkLearnTfIdf()
27 |
28 | features_bag_list = technique.produce_content("Title", [], [], JSONFile(file_path))
29 |
30 | self.assertEqual(len(features_bag_list), 20)
31 | self.assertIsInstance(features_bag_list[0], FeaturesBagField)
32 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/utils/check_tokenization.py:
--------------------------------------------------------------------------------
1 | from typing import Union, List
2 | from nltk import data, download, sent_tokenize, word_tokenize
3 |
4 | # nltk corpus
5 | corpus_downloaded = False
6 |
7 |
8 | def check_tokenized(text):
9 | """
10 | Tokenizes a text
11 | """
12 | if type(text) is str:
13 | global corpus_downloaded
14 |
15 | if not corpus_downloaded:
16 | try:
17 | data.find('punkt')
18 | except LookupError:
19 | download('punkt')
20 |
21 | corpus_downloaded = True
22 |
23 | text = word_tokenize(text)
24 |
25 | return text
26 |
27 |
28 | def check_not_tokenized(text):
29 | """
30 | Untokenizes a tokenized text
31 | """
32 | if type(text) is list:
33 | text = ' '.join(text)
34 |
35 | return text
36 |
37 |
38 | def tokenize_in_sentences(text: Union[List[str], str]):
39 | """
40 | Tokenizes a text into sentences
41 | """
42 | global corpus_downloaded
43 |
44 | if not corpus_downloaded:
45 | try:
46 | data.find('punkt')
47 | except LookupError:
48 | download('punkt')
49 |
50 | corpus_downloaded = True
51 |
52 | return sent_tokenize(check_not_tokenized(text))
53 |
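A quick sketch of how these helpers behave (NLTK's `punkt` data is downloaded automatically on first use):

```python
from clayrs.content_analyzer.utils.check_tokenization import (
    check_tokenized, check_not_tokenized, tokenize_in_sentences)

print(check_tokenized("a jungle board game"))     # ['a', 'jungle', 'board', 'game']
print(check_tokenized(["already", "tokenized"]))  # lists are returned unchanged
print(check_not_tokenized(["a", "jungle"]))       # 'a jungle'
print(tokenize_in_sentences("First sentence. Second one."))
# ['First sentence.', 'Second one.']
```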
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/sentence_embeddings.md:
--------------------------------------------------------------------------------
1 | # Sentence Embeddings
2 |
3 | Via the following, you can obtain embeddings of ***sentence*** granularity
4 |
5 | ```python
6 | from clayrs import content_analyzer as ca
7 |
8 | # obtain sentence embeddings using pre-trained model 'paraphrase-distilroberta-base-v1'
9 | # from the SBERT library
10 | ca.SentenceEmbeddingTechnique(embedding_source=ca.Sbert('paraphrase-distilroberta-base-v1'))
11 | ```
12 |
13 | ::: clayrs.content_analyzer.SentenceEmbeddingTechnique
14 | handler: python
15 | options:
16 | show_root_toc_entry: true
17 | show_root_heading: true
18 |
19 | ## Sentence Embedding models
20 |
21 | ::: clayrs.content_analyzer.BertTransformers
22 | handler: python
23 | options:
24 | heading_level: 3
25 | show_root_toc_entry: true
26 | show_root_heading: true
27 |
28 | ::: clayrs.content_analyzer.Sbert
29 | handler: python
30 | options:
31 | heading_level: 3
32 | show_root_toc_entry: true
33 | show_root_heading: true
34 |
35 | ::: clayrs.content_analyzer.T5Transformers
36 | handler: python
37 | options:
38 | heading_level: 3
39 | show_root_toc_entry: true
40 | show_root_heading: true
41 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/ratings_manager/sentiment_analysis.py:
--------------------------------------------------------------------------------
1 | from textblob import TextBlob
2 |
3 | from clayrs.content_analyzer.ratings_manager.score_processor import SentimentAnalysis
4 |
5 |
6 | class TextBlobSentimentAnalysis(SentimentAnalysis):
7 | """
8 | Class that computes sentiment polarity on a textual field using the TextBlob library.
9 |
10 | The given score will be in the $[-1.0, 1.0]$ range
11 | """
12 | def __init__(self, decimal_rounding: int = None):
13 | super().__init__(decimal_rounding)
14 |
15 | def __str__(self):
16 | return "TextBlobSentimentalAnalysis"
17 |
18 | def __repr__(self):
19 | return f'TextBlobSentimentAnalysis(decimal_rounding={self.decimal_rounding})'
20 |
21 | def fit(self, score_data: str) -> float:
22 | """
23 | This method calculates the sentiment polarity score on textual reviews
24 |
25 | Args:
26 | score_data: text for which sentiment polarity must be computed and considered as score
27 |
28 | Returns:
29 | The sentiment polarity of the textual data in range $[-1.0, 1.0]$
30 | """
31 | polarity_score = TextBlob(score_data).sentiment.polarity
32 |
33 | if self.decimal_rounding is not None:
34 | polarity_score = round(polarity_score, self.decimal_rounding)
35 |
36 | return polarity_score
37 |
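For example, mirroring the test shown earlier in this listing:

```python
from clayrs.content_analyzer.ratings_manager.sentiment_analysis import TextBlobSentimentAnalysis

processor = TextBlobSentimentAnalysis(decimal_rounding=4)

print(processor.fit("This movie is really wonderful"))  # positive polarity, in [-1.0, 1.0]
print(processor.fit("This movie is awful"))             # negative polarity
```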
--------------------------------------------------------------------------------
/test/test_files/test_decode/movies_title_string.json:
--------------------------------------------------------------------------------
1 | [{"Title":"test","Year":"1995","Rated":"PG","Released":"15 Dec 1995","Runtime":"104 min","Genre":"Adventure, Family, Fantasy","Director":"Joe Johnston","Writer":"Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Actors":"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","Plot":"After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","Language":"English, French","Country":"USA","Awards":"4 wins & 9 nominations.","Poster":"https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"6.9/10"},{"Source":"Rotten Tomatoes","Value":"53%"},{"Source":"Metacritic","Value":"39/100"}],"Metascore":"39","imdbRating":"6.9","imdbVotes":"260,909","imdbID":"tt0113497","Type":"movie","DVD":"25 Jan 2000","BoxOffice":"N/A","Production":"Sony Pictures Home Entertainment","Website":"N/A","Response":"True"}]
--------------------------------------------------------------------------------
/test/test_files/test_decode/movies_title_tfidf.json:
--------------------------------------------------------------------------------
1 | [{"Title":"{'jumanji': 2}","Year":"1995","Rated":"PG","Released":"15 Dec 1995","Runtime":"104 min","Genre":"Adventure, Family, Fantasy","Director":"Joe Johnston","Writer":"Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Actors":"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","Plot":"After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","Language":"English, French","Country":"USA","Awards":"4 wins & 9 nominations.","Poster":"https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"6.9/10"},{"Source":"Rotten Tomatoes","Value":"53%"},{"Source":"Metacritic","Value":"39/100"}],"Metascore":"39","imdbRating":"6.9","imdbVotes":"260,909","imdbID":"tt0113497","Type":"movie","DVD":"25 Jan 2000","BoxOffice":"N/A","Production":"Sony Pictures Home Entertainment","Website":"N/A","Response":"True"}]
--------------------------------------------------------------------------------
/clayrs/utils/automatic_methods.py:
--------------------------------------------------------------------------------
1 | import inspect
2 |
3 |
4 | def autorepr(obj, frame):
5 | # pull tuple from frame
6 | args, args_paramname, kwargs_paramname, values = inspect.getargvalues(frame)
7 |
8 | args = args[1:] # remove 'self' argument from function
9 |
10 | arg_string = ''
11 |
12 | # add formal arguments to the arg string
13 | arg_string += ', '.join([f"{arg}={repr(values[arg])}"
14 | for arg in (args if args is not None else [])])
15 |
16 | # show positional varargs
17 | if args_paramname is not None:
18 | varglist = values[args_paramname]
19 | if len(arg_string) != 0:
20 | arg_string += ', '
21 | arg_string += ', '.join([f"*{args_paramname}={repr(v)}"
22 | for v in (varglist if varglist is not None else [])])
23 |
24 | # show named varargs
25 | if kwargs_paramname is not None:
26 | varglist = values[kwargs_paramname]
27 | if len(arg_string) != 0:
28 | arg_string += ', '
29 | arg_string += ', '.join([f"*{kwargs_paramname}_{k}={repr(varglist[k])}"
30 | for k in (sorted(varglist) if varglist is not None else [])])
31 |
32 | name_obj = obj.__class__.__name__
33 | repr_string = f"{name_obj}({arg_string})"
34 |
35 | return repr_string
36 |
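A sketch of the intended usage: the caller passes its own frame so that `autorepr` can inspect the arguments it was invoked with (the `Example` class below is hypothetical):

```python
import inspect

from clayrs.utils.automatic_methods import autorepr


class Example:
    def __init__(self, alpha, *values, **options):
        # capture the constructor arguments for a readable repr
        self._repr_string = autorepr(self, inspect.currentframe())

    def __repr__(self):
        return self._repr_string


print(Example(1, 2, 3, beta=4))
# Example(alpha=1, *values=2, *values=3, *options_beta=4)
```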
--------------------------------------------------------------------------------
/.github/workflows/testing_pipeline.yml:
--------------------------------------------------------------------------------
1 | name: Testing pipeline
2 | on:
3 | push:
4 | branches:
5 | - master
6 | pull_request:
7 | branches:
8 | - master
9 | workflow_dispatch:
10 |
11 | jobs:
12 | main:
13 | runs-on: ubuntu-latest
14 | strategy:
15 | matrix:
16 | python-version: ['3.8', '3.10']
17 |
18 | steps:
19 | -
20 | name: Checkout Repository
21 | uses: actions/checkout@v3
22 | -
23 | name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v4
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | -
28 | name: Setup installation requirements
29 | run: |
30 | pip install --upgrade pip wheel
31 | pip install pytest-cov==3.0.0
32 | pip install -e .
33 | -
34 | name: Testing Python ${{ matrix.python-version }} with coverage
35 | run: |
36 | pytest --color=yes --cov-report xml:codecoverage_${{ matrix.python-version }}.xml --cov=clayrs test/
37 | -
38 | name: Upload coverage to Codecov
39 | uses: codecov/codecov-action@v3
40 | with: # no token required for public repos
41 | fail_ci_if_error: true
42 | files: ./codecoverage_${{ matrix.python-version }}.xml
43 | flags: python_${{ matrix.python-version }}
44 |
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/centroid_vector/similarities.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Union
3 |
4 | import numpy as np
5 | from scipy import sparse
6 | from sklearn.metrics.pairwise import cosine_similarity
7 |
8 |
9 | class Similarity(ABC):
10 | """
11 | Abstract Class for the various types of similarity
12 | """
13 | def __init__(self):
14 | pass
15 |
16 | @abstractmethod
17 | def perform(self, v1: Union[np.ndarray, sparse.csr_matrix], v2: Union[np.ndarray, sparse.csr_matrix]):
18 | """
19 | Calculates the similarity between v1 and v2
20 | """
21 | raise NotImplementedError
22 |
23 |
24 | class CosineSimilarity(Similarity):
25 | """
26 | Computes cosine similarity
27 | """
28 | def __init__(self):
29 | super().__init__()
30 |
31 | def perform(self, v1: Union[np.ndarray, sparse.csr_matrix], v2: Union[np.ndarray, sparse.csr_matrix]):
32 | """
33 | Calculates the cosine similarity between v1 and v2
34 |
35 | Args:
36 | v1: first numpy array or scipy sparse matrix
37 | v2: second numpy array or scipy sparse matrix
38 | """
39 |
40 | return cosine_similarity(v1, v2, dense_output=True)
41 |
42 | def __str__(self):
43 | return "CosineSimilarity"
44 |
45 | def __repr__(self):
46 | return f"CosineSimilarity()"
47 |
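For example, comparing one centroid against two item vectors (inputs must be 2D, as scikit-learn's `cosine_similarity` expects):

```python
import numpy as np

from clayrs.recsys.content_based_algorithm.centroid_vector.similarities import CosineSimilarity

centroid = np.array([[1.0, 0.0, 1.0]])
items = np.array([[1.0, 0.0, 1.0],
                  [0.0, 1.0, 0.0]])

# one row per centroid vector, one column per item vector
print(CosineSimilarity().perform(centroid, items))  # [[1. 0.]]
```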
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/contextualized_embeddings.md:
--------------------------------------------------------------------------------
1 | # Contextualized Embeddings
2 |
3 | Via the following, you can obtain embeddings of *finer* granularity from models which are also able to return
4 | embeddings of *coarser* granularity (e.g. obtain word embeddings from a model which is also able to return sentence
5 | embeddings).
6 |
7 | For now, only models working at sentence and token level are implemented
8 |
9 | ```python
10 | from clayrs import content_analyzer as ca
11 |
12 | # obtain token (word) embeddings from a model
13 | # which is able to return sentence embeddings
14 | ca.Sentence2WordEmbedding(embedding_source=ca.BertTransformers('bert-base-uncased'))
15 | ```
16 |
17 | ::: clayrs.content_analyzer.Sentence2WordEmbedding
18 | handler: python
19 | options:
20 | show_root_toc_entry: true
21 | show_root_heading: true
22 |
23 | ## Models able to return sentence and token embeddings
24 |
25 | ::: clayrs.content_analyzer.BertTransformers
26 | handler: python
27 | options:
28 | heading_level: 3
29 | show_root_toc_entry: true
30 | show_root_heading: true
31 |
32 | ::: clayrs.content_analyzer.T5Transformers
33 | handler: python
34 | options:
35 | heading_level: 3
36 | show_root_toc_entry: true
37 | show_root_heading: true
38 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/utils/id_merger.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 |
4 | def id_values_merger(id_values: List[str]):
5 | """
6 | This function is used to compact a list of ids into a unique string. This can be useful when
7 | there is content whose id is composed by values coming from more than one field.
8 |
9 | Args:
10 | id_values (List): List containing one or more ids
11 |
12 | Returns:
13 | id_merged (str): String in which the values contained in the list given in input are
14 | merged
15 | """
16 | if type(id_values) == str or type(id_values) == int:
17 | return str(id_values)
18 | elif type(id_values) == list:
19 | id_merged = ""
20 | for i in range(len(id_values)):
21 | id_merged += str(id_values[i])
22 | if i != len(id_values) - 1:
23 | id_merged += "_"
24 | return id_merged
25 | else:
26 | raise TypeError("id must be an integer, a string or a list of strings and/or integer")
27 |
28 |
29 | def id_merger(raw_content: dict, field_list: List[str]) -> str:
30 | """
31 | Function that creates the list of ids and then calls id_values_merger to create a unique id
32 | """
33 | id_values = []
34 | for field_name in field_list:
35 | id_values.append(raw_content[field_name])
36 |
37 | return id_values_merger(id_values)
38 |
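For example:

```python
from clayrs.content_analyzer.utils.id_merger import id_values_merger, id_merger

print(id_values_merger(["1", "tt0112281"]))  # '1_tt0112281'
print(id_values_merger(42))                  # '42' (single ids pass through)

raw_content = {"user_id": 1, "item_id": "tt0112281", "score": 4.5}
print(id_merger(raw_content, ["user_id", "item_id"]))  # '1_tt0112281'
```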
--------------------------------------------------------------------------------
/docs/mkdocs/docs/recsys/graph_based/feature_selection.md:
--------------------------------------------------------------------------------
1 | # Feature Selection
2 |
3 | Via the `feature_selection_fn` function you are able to perform feature selection on a given graph, keeping the properties
4 | that are most important according to a given ***feature selection algorithm***. Check the documentation of the
5 | function for more details and for a *usage example*
6 |
7 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_fn
8 | handler: python
9 |
10 | ---
11 |
12 | ## Feature Selection algorithms
13 |
14 | The following are the feature selection algorithms you can use in the `fs_algorithm_user`
15 | and/or in the `fs_algorithm_item` parameter
16 |
17 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_alg.TopKPageRank
18 | handler: python
19 | options:
20 | heading_level: 3
21 | show_root_toc_entry: true
22 | show_root_heading: true
23 | members: none
24 |
25 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_alg.TopKEigenVectorCentrality
26 | handler: python
27 | options:
28 | heading_level: 3
29 | show_root_toc_entry: true
30 | show_root_heading: true
31 | members: none
32 |
33 | ::: clayrs.recsys.graphs.feature_selection.feature_selection_alg.TopKDegreeCentrality
34 | handler: python
35 | options:
36 | heading_level: 3
37 | show_root_toc_entry: true
38 | show_root_heading: true
39 | members: none
40 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_loader/gensim.py:
--------------------------------------------------------------------------------
1 | from gensim import downloader
2 | import numpy as np
3 |
4 | from clayrs.utils.const import logger
5 | from clayrs.content_analyzer.embeddings.embedding_loader.embedding_loader import WordEmbeddingLoader
6 |
7 |
8 | class Gensim(WordEmbeddingLoader):
9 | """
10 | Class that produces word embeddings using gensim pre-trained models.
11 |
12 | The model will be automatically downloaded using the gensim downloader api if not present locally.
13 |
14 | Args:
15 | model_name: Name of the model to load/download
16 | """
17 |
18 | def __init__(self, model_name: str = 'glove-twitter-25'):
19 | super().__init__(model_name)
20 |
21 | def get_vector_size(self) -> int:
22 | return self.model.vector_size
23 |
24 | def get_embedding(self, word: str) -> np.ndarray:
25 | return self.model[word]
26 |
27 | def load_model(self):
28 | # if the reference isn't in the possible models, FileNotFoundError is raised
29 | if self.reference in downloader.info()['models']:
30 | logger.info(f"Downloading/Loading {str(self)}")
31 |
32 | return downloader.load(self.reference)
33 | else:
34 | raise FileNotFoundError
35 |
36 | def __str__(self):
37 | return f"Gensim {self.reference}"
38 |
39 | def __repr__(self):
40 | return f'Gensim(model_name={self.reference})'
41 |
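A usage sketch (requires a network connection the first time, since the model is fetched through the gensim downloader api; it assumes the loader superclass invokes `load_model()` when the source is created, as the mocked tests suggest):

```python
from clayrs.content_analyzer.embeddings.embedding_loader.gensim import Gensim

source = Gensim('glove-twitter-25')

print(source.get_vector_size())             # 25
print(source.get_embedding('title').shape)  # (25,)
```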
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/information_preprocessors/postprocessors/postprocessor.md:
--------------------------------------------------------------------------------
1 | # Postprocessor
2 |
3 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.CountVisualBagOfWords
4 | handler: python
5 | options:
6 | show_root_toc_entry: true
7 | show_root_heading: true
8 |
9 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.TfIdfVisualBagOfWords
10 | handler: python
11 | options:
12 | show_root_toc_entry: true
13 | show_root_heading: true
14 |
15 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.ScipyVQ
16 | handler: python
17 | options:
18 | show_root_toc_entry: true
19 | show_root_heading: true
20 |
21 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.SkLearnPCA
22 | handler: python
23 | options:
24 | show_root_toc_entry: true
25 | show_root_heading: true
26 |
27 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.SkLearnGaussianRandomProjections
28 | handler: python
29 | options:
30 | show_root_toc_entry: true
31 | show_root_heading: true
32 |
33 | ::: clayrs.content_analyzer.information_processor.postprocessors.postprocessor.SkLearnFeatureAgglomeration
34 | handler: python
35 | options:
36 | show_root_toc_entry: true
37 | show_root_heading: true
38 |
--------------------------------------------------------------------------------
/test/content_analyzer/information_processor/test_visualpreprocessors/test_torch_builtin_augmenter.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from torchvision import transforms
4 |
5 | import clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_augmenter as clayrs_augments
6 |
7 | from test.content_analyzer.information_processor.test_visualpreprocessors.test_torch_builtin_transformer import \
8 | TestTorchBuiltInTransformer
9 |
10 |
11 | class TestTorchAutoAugment(TestTorchBuiltInTransformer):
12 |
13 | def setUp(self):
14 | self.technique = clayrs_augments.TorchAutoAugment()
15 | self.og_technique = transforms.AutoAugment()
16 |
17 | def test_forward(self):
18 | self.assertTrue(self.expected_result_equal())
19 |
20 |
21 | class TestTorchRandAugment(TestTorchBuiltInTransformer):
22 |
23 | def setUp(self):
24 | self.technique = clayrs_augments.TorchRandAugment()
25 | self.og_technique = transforms.RandAugment()
26 |
27 | def test_forward(self):
28 | self.assertTrue(self.expected_result_equal())
29 |
30 |
31 | class TestTorchTrivialAugmentWide(TestTorchBuiltInTransformer):
32 |
33 | def setUp(self):
34 | self.technique = clayrs_augments.TorchTrivialAugmentWide()
35 | self.og_technique = transforms.TrivialAugmentWide()
36 |
37 | def test_forward(self):
38 | self.assertTrue(self.expected_result_equal())
39 |
40 |
41 | if __name__ == "__main__":
42 | unittest.main()
43 |
--------------------------------------------------------------------------------
/clayrs/utils/custom_logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import Optional, Dict
3 |
4 | from colorama import Fore, Back, Style
5 |
6 |
7 | class ColoredFormatter(logging.Formatter):
8 | """Colored log formatter."""
9 |
10 | def __init__(self, *args, colors: Optional[Dict[str, str]] = None, **kwargs) -> None:
11 | """Initialize the formatter with specified format strings."""
12 |
13 | super().__init__(*args, **kwargs)
14 |
15 | self.colors = colors if colors else {}
16 |
17 | def format(self, record) -> str:
18 | """Format the specified record as text."""
19 |
20 | record.color = self.colors.get(record.levelname, '')
21 | record.reset = Style.RESET_ALL
22 |
23 | return super().format(record)
24 |
25 |
26 | formatter = ColoredFormatter(
27 | "\r{color}{levelname}{reset} - {message}",
28 | style='{',
29 | colors={
30 | 'DEBUG': Fore.CYAN,
31 | 'INFO': Fore.RESET,
32 | 'WARNING': Fore.YELLOW,
33 | 'ERROR': Fore.RED,
34 | 'CRITICAL': Fore.BLACK + Back.RED + Style.BRIGHT,
35 | }
36 | )
37 |
38 |
39 | def get_custom_logger(name: str):
40 | handler = logging.StreamHandler()
41 | handler.setFormatter(formatter)
42 |
43 | logger = logging.getLogger(name)
44 | if len(logger.handlers) != 0:
45 | logger.handlers.clear()
46 |
47 | logger.addHandler(handler)
48 | logger.setLevel(logging.INFO)
49 | logger.propagate = False
50 | return logger
51 |
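For example:

```python
from clayrs.utils.custom_logger import get_custom_logger

logger = get_custom_logger("clayrs_example")

logger.info("Loading contents...")        # default color
logger.warning("Contents not cached")     # rendered in yellow
logger.debug("Not shown: level is INFO")  # suppressed
```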
--------------------------------------------------------------------------------
/clayrs/recsys/content_based_algorithm/exceptions.py:
--------------------------------------------------------------------------------
1 | class UserSkipAlgFit(Exception):
2 | """
3 | Superclass for exceptions related to the fit of a single user. If one of these exceptions is raised, the algorithm
4 | can't be fitted for the user, which will therefore be skipped
5 | """
6 | pass
7 |
8 |
9 | class OnlyPositiveItems(UserSkipAlgFit):
10 | """
11 | Exception to raise when there are only positive items available locally for the user
12 | """
13 | pass
14 |
15 |
16 | class OnlyNegativeItems(UserSkipAlgFit):
17 | """
18 | Exception to raise when there are only negative items available locally for the user
19 | """
20 | pass
21 |
22 |
23 | class NoRatedItems(UserSkipAlgFit):
24 | """
25 | Exception to raise when there are no items available locally for the user
26 | """
27 | pass
28 |
29 |
30 | class EmptyUserRatings(UserSkipAlgFit):
31 | """
32 | Exception to raise when the user's ratings are empty
33 | """
34 | pass
35 |
36 |
37 | class NotRankingAlg(Exception):
38 | """
39 | Exception to raise when the algorithm is not a ranking algorithm, but it is asked to rank
40 | """
41 | pass
42 |
43 |
44 | class NotPredictionAlg(Exception):
45 | """
46 | Exception to raise when the algorithm is not a prediction algorithm, but it is asked to predict
47 | """
48 | pass
49 |
50 |
51 | class NotFittedAlg(Exception):
52 | """
53 | Exception to raise when the algorithm has not been fitted
54 | """
55 | pass
56 |
--------------------------------------------------------------------------------
/clayrs/utils/class_utils.py:
--------------------------------------------------------------------------------
1 | from typing import Type, Set
2 | import inspect
3 |
4 |
5 | def get_all_implemented_subclasses(cls: Type) -> Set:
6 | """
7 | Method that retrieves all implemented subclasses of a given class
8 | (also considering subclasses of a subclass and so on)
9 |
10 | The method calls itself to find the subclasses of each subclass
11 |
12 | Args:
13 | cls (Type): class from which all implemented subclasses will be extracted
14 |
15 | Returns:
16 | set containing all of cls' implemented subclasses
17 | """
18 | return set([sub for sub in cls.__subclasses__() if not inspect.isabstract(sub)]).union(
19 | [sub for c in cls.__subclasses__() for sub in get_all_implemented_subclasses(c) if not inspect.isabstract(sub)])
20 |
21 |
22 | def get_all_implemented_classes(cls: Type) -> Set:
23 | """
24 | Method that retrieves all implemented subclasses of a given class
25 | (also considering subclasses of a subclass and so on)
26 |
27 | The method calls itself to find the subclasses of each subclass
28 |
29 | If the class passed as argument is not abstract, it is added to the resulting set
30 |
31 | Args:
32 | cls (Type): class from which all implemented subclasses will be extracted
33 |
34 | Returns:
35 | set containing all of cls' implemented subclasses and cls itself if it is not abstract
36 | """
37 |
38 | classes = get_all_implemented_subclasses(cls)
39 |
40 | if not inspect.isabstract(cls):
41 | classes.add(cls)
42 |
43 | return classes
44 |
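A toy hierarchy showing the difference between the two helpers:

```python
import abc

from clayrs.utils.class_utils import get_all_implemented_classes, get_all_implemented_subclasses


class Base(abc.ABC):
    @abc.abstractmethod
    def run(self):
        ...


class Concrete(Base):
    def run(self):
        return 1


class DeeperConcrete(Concrete):
    def run(self):
        return 2


print(get_all_implemented_subclasses(Base))   # {Concrete, DeeperConcrete}
print(get_all_implemented_classes(Base))      # same set: Base is abstract, so it's excluded
print(get_all_implemented_classes(Concrete))  # {Concrete, DeeperConcrete}: Concrete itself is added
```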
--------------------------------------------------------------------------------
/test/test_files/test_decode/movies_title_embedding.json:
--------------------------------------------------------------------------------
1 | [{"Title":"[0.10984,-0.72454,1.21200001,-0.16188,-0.77879,-0.31345001,-0.27814001,-0.27860001,0.33089,-0.62764001,0.31617999,0.34035999,-0.66911,-0.52311999,1.66120005,1.10749996,0.25200999,0.098685,0.96275002,0.66688001,-0.33248001,0.22236,0.67574,-1.01069999,0.27109]","Year":"1995","Rated":"PG","Released":"15 Dec 1995","Runtime":"104 min","Genre":"Adventure, Family, Fantasy","Director":"Joe Johnston","Writer":"Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Actors":"Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","Plot":"After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","Language":"English, French","Country":"USA","Awards":"4 wins & 9 nominations.","Poster":"https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","Ratings":[{"Source":"Internet Movie Database","Value":"6.9/10"},{"Source":"Rotten Tomatoes","Value":"53%"},{"Source":"Metacritic","Value":"39/100"}],"Metascore":"39","imdbRating":"6.9","imdbVotes":"260,909","imdbID":"tt0113497","Type":"movie","DVD":"25 Jan 2000","BoxOffice":"N/A","Production":"Sony Pictures Home Entertainment","Website":"N/A","Response":"True"}]
--------------------------------------------------------------------------------
/clayrs/content_analyzer/exceptions.py:
--------------------------------------------------------------------------------
1 | from functools import wraps
2 |
3 | import numpy as np
4 |
5 |
6 | def handler_score_not_float(func):
7 | """
8 | Decorator that catches a ValueError raised by the wrapped function.
9 |
10 | Tries to run the function normally; if a ValueError is raised, it is re-raised with a clearer
11 | message explaining that the 'score' and 'timestamp' columns must contain numbers.
12 | """
13 | @wraps(func)
14 | def inner_function(*args, **kwargs):
15 | try:
16 | return func(*args, **kwargs)
17 | except ValueError:
18 | raise ValueError("The 'score' and 'timestamp' columns must contains numbers!\n"
19 | "Try to apply a score processor or change columns!") from None
20 |
21 | return inner_function
22 |
23 |
24 | def handler_empty_matrix(dtype):
25 |
26 | def handler_for_function(func):
27 | """
28 | Decorator that catches an IndexError raised by the wrapped function.
29 |
30 | Tries to run the function normally; if an IndexError is raised, an empty numpy array
31 | of the given dtype is returned instead.
32 | """
33 | @wraps(func)
34 | def inner_function(*args, **kwargs):
35 | try:
36 | return func(*args, **kwargs)
37 | except IndexError:
38 | return np.array([], dtype=dtype)
39 |
40 | return inner_function
41 |
42 | return handler_for_function
43 |
44 |
45 | class UserNone(Exception):
46 | pass
47 |
48 |
49 | class ItemNone(Exception):
50 | pass
51 |
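For example, `handler_empty_matrix` can wrap a function that indexes into a possibly empty array:

```python
import numpy as np

from clayrs.content_analyzer.exceptions import handler_empty_matrix


@handler_empty_matrix(dtype=np.float64)
def first_row(matrix: np.ndarray) -> np.ndarray:
    return matrix[0]


print(first_row(np.array([[1.0, 2.0]])))  # [1. 2.]
print(first_row(np.array([])))            # IndexError caught -> empty float64 array
```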
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/test_embedding_source.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | import numpy as np
4 | from unittest import TestCase
5 | from math import isclose
6 |
7 |
8 | class TestEmbeddingSource(TestCase):
9 |
10 | # Will be used by several test involving embeddings
11 | def assertWordEmbeddingMatches(self, source, embedding: np.ndarray, word: str):
12 | # 'similar_by_vector()' returns a list with top n
13 | # words similar to the vector given. I'm interested only in the most similar
14 | # so n = 1
15 | # for example, top_1 will be in the following form ("title", 1.0)
16 | top_1 = source.model.similar_by_vector(embedding, 1)[0]
17 |
18 | # So I'm using indices to access the tuples values.
19 | # 'like' contains how similar is 'embedding_word' to the 'embedding' vector given
20 | embedding_word = top_1[0]
21 | like = top_1[1]
22 |
23 | # if the word associated with the embedding vector returned by the model doesn't match the word passed as
24 | # argument, AssertionError is raised
25 | if not embedding_word == word:
26 | raise AssertionError("Word %s is not %s" % (embedding_word, word))
27 |
28 | # Due to approximation, the reverse lookup won't return the
29 | # exact word; but if the similarity is equal to 1 with a maximum error of 'abs_tol'
30 | # I'm assuming it's exactly that word
31 | if not isclose(like, 1, abs_tol=1e-6):
32 | raise AssertionError("Word %s and result word %s do not match" % (embedding_word, word))
33 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/word_embeddings.md:
--------------------------------------------------------------------------------
1 | # Word Embeddings
2 |
3 | Via the following, you can obtain embeddings of ***word*** granularity
4 |
5 | ```python
6 | from clayrs import content_analyzer as ca
7 |
8 | # obtain word embeddings using pre-trained model 'glove-twitter-50'
9 | # from Gensim library
10 | ca.WordEmbeddingTechnique(embedding_source=ca.Gensim('glove-twitter-50'))
11 | ```
12 |
13 | ::: clayrs.content_analyzer.WordEmbeddingTechnique
14 | handler: python
15 | options:
16 | show_root_toc_entry: true
17 | show_root_heading: true
18 |
19 |
20 | ## Word Embedding models
21 |
22 | ::: clayrs.content_analyzer.Gensim
23 | handler: python
24 | options:
25 | heading_level: 3
26 | show_root_toc_entry: true
27 | show_root_heading: true
28 |
29 | ::: clayrs.content_analyzer.GensimDoc2Vec
30 | handler: python
31 | options:
32 | heading_level: 3
33 | show_root_toc_entry: true
34 | show_root_heading: true
35 |
36 | ::: clayrs.content_analyzer.GensimFastText
37 | handler: python
38 | options:
39 | heading_level: 3
40 | show_root_toc_entry: true
41 | show_root_heading: true
42 |
43 | ::: clayrs.content_analyzer.GensimRandomIndexing
44 | handler: python
45 | options:
46 | heading_level: 3
47 | show_root_toc_entry: true
48 | show_root_heading: true
49 |
50 | ::: clayrs.content_analyzer.GensimWord2Vec
51 | handler: python
52 | options:
53 | heading_level: 3
54 | show_root_toc_entry: true
55 | show_root_heading: true
56 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_loader/sbert.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer
2 | import numpy as np
3 |
4 | from clayrs.content_analyzer.embeddings.embedding_loader.embedding_loader import SentenceEmbeddingLoader
5 | from clayrs.utils.const import logger
6 |
7 |
8 | class Sbert(SentenceEmbeddingLoader):
9 | """
10 | Class that produces sentence embeddings using SBERT.
11 |
12 | The model will be automatically downloaded if not present locally.
13 |
14 | Args:
15 | model_name_or_file_path: name of the model to download or path where the model is stored
16 | locally
17 | """
18 |
19 | def __init__(self, model_name_or_file_path: str = 'paraphrase-distilroberta-base-v1'):
20 | super().__init__(model_name_or_file_path)
21 |
22 | def load_model(self):
23 | try:
24 | logger.info(f"Downloading/Loading {str(self)}")
25 |
26 | return SentenceTransformer(self.reference)
27 | except (OSError, AttributeError):
28 | raise FileNotFoundError("Model not found!")
29 |
30 | def get_vector_size(self) -> int:
31 | return self.model.get_sentence_embedding_dimension()
32 |
33 | def get_embedding(self, sentence: str) -> np.ndarray:
34 | return self.model.encode(sentence, show_progress_bar=False)
35 |
36 | def get_embedding_token(self, sentence: str) -> np.ndarray:
37 | raise NotImplementedError("The model chosen can't return token embeddings")
38 |
39 | def __str__(self):
40 | return f"Sbert {self.reference}"
41 |
42 | def __repr__(self):
43 | return f"Sbert(model_name_or_file_path={self.reference})"
44 |
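A usage sketch (the model is downloaded on first use; `load` is provided by the embedding source superclass, as exercised by the Sbert test earlier in this listing):

```python
from clayrs.content_analyzer.embeddings.embedding_loader.sbert import Sbert

source = Sbert('paraphrase-distilroberta-base-v1')

embeddings = source.load(["this is a phrase", "this is another phrase"])
print(len(embeddings), source.get_vector_size())  # 2 sentences, 768-dimensional vectors
```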
--------------------------------------------------------------------------------
/test/utils/test_context_managers.py:
--------------------------------------------------------------------------------
1 | import time
2 | from unittest import TestCase
3 |
4 | import tqdm
5 |
6 | from clayrs.utils.context_managers import get_progbar, get_iterator_parallel
7 |
8 |
9 | class TestContextManagers(TestCase):
10 |
11 | def test_get_progbar(self):
12 |
13 | with get_progbar(range(50), total=50) as pbar:
14 | self.assertIsInstance(pbar, tqdm.tqdm)
15 |
16 | expected_bar_format = "{desc} {percentage:.0f}%|{bar}| {n:}/{total_fmt} [{elapsed}<{remaining}]"
17 | result_bar_format = pbar.bar_format
18 |
19 | expected_list = list(range(50))
20 | result_list = list(pbar)
21 |
22 | self.assertEqual(expected_bar_format, result_bar_format)
23 | self.assertEqual(expected_list, result_list)
24 |
25 | def test_get_iterator_parallel(self):
26 |
27 | def f(x):
28 | time.sleep(1)
29 |
30 | return x
31 |
32 | expected_list = list(range(5))
33 |
34 | # single cpu
35 | with get_iterator_parallel(1, f, list(range(5))) as it:
36 | result_list = list(it)
37 |
38 | self.assertEqual(expected_list, result_list)
39 |
40 | # multi cpu
41 | with get_iterator_parallel(2, f, list(range(5))) as it:
42 | result_list = list(it)
43 |
44 | self.assertEqual(expected_list, result_list)
45 |
46 | # multi cpu with progbar
47 | with get_iterator_parallel(2, f, list(range(5)), progress_bar=True, total=5) as pbar:
48 |
49 | self.assertIsInstance(pbar, tqdm.tqdm)
50 |
51 | result_list = list(pbar)
52 |
53 | self.assertEqual(expected_list, result_list)
54 |
--------------------------------------------------------------------------------
/test/test_files/users_70.dat:
--------------------------------------------------------------------------------
1 | 1::F::1::10::48067
2 | 2::M::56::16::70072
3 | 3::M::25::15::55117
4 | 4::M::45::7::02460
5 | 5::M::25::20::55455
6 | 6::F::50::9::55117
7 | 7::M::35::1::06810
8 | 8::M::25::12::11413
9 | 9::M::25::17::61614
10 | 10::F::35::1::95370
11 | 11::F::25::1::04093
12 | 12::M::25::12::32793
13 | 13::M::45::1::93304
14 | 14::M::35::0::60126
15 | 15::M::25::7::22903
16 | 16::F::35::0::20670
17 | 17::M::50::1::95350
18 | 18::F::18::3::95825
19 | 19::M::1::10::48073
20 | 20::M::25::14::55113
21 | 21::M::18::16::99353
22 | 22::M::18::15::53706
23 | 23::M::35::0::90049
24 | 24::F::25::7::10023
25 | 25::M::18::4::01609
26 | 26::M::25::7::23112
27 | 27::M::25::11::19130
28 | 28::F::25::1::14607
29 | 29::M::35::7::33407
30 | 30::F::35::7::19143
31 | 31::M::56::7::06840
32 | 32::F::25::0::19355
33 | 33::M::45::3::55421
34 | 34::F::18::0::02135
35 | 35::M::45::1::02482
36 | 36::M::25::3::94123
37 | 37::F::25::9::66212
38 | 38::F::18::4::02215
39 | 39::M::18::4::61820
40 | 40::M::45::0::10543
41 | 41::F::18::4::15116
42 | 42::M::25::8::24502
43 | 43::M::25::12::60614
44 | 44::M::45::17::98052
45 | 45::F::45::16::94110
46 | 46::M::18::19::75602
47 | 47::M::18::4::94305
48 | 48::M::25::4::92107
49 | 49::M::18::12::77084
50 | 50::F::25::2::98133
51 | 51::F::1::10::10562
52 | 52::M::18::4::72212
53 | 53::M::25::0::96931
54 | 54::M::50::1::56723
55 | 55::F::35::12::55303
56 | 56::M::35::20::60440
57 | 57::M::18::19::30350
58 | 58::M::25::2::30303
59 | 59::F::50::1::55413
60 | 60::M::50::1::72118
61 | 61::M::25::17::95122
62 | 62::F::35::3::98105
63 | 63::M::18::4::54902
64 | 64::M::18::1::53706
65 | 65::M::35::12::55803
66 | 66::M::25::18::57706
67 | 67::F::50::5::60181
68 | 68::M::18::4::53706
69 | 69::F::25::1::02143
70 | 70::M::18::4::53703
71 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | from setuptools import setup
3 |
4 | with open('requirements.txt') as f:
5 | requirements = f.read().splitlines()
6 |
7 | with open("README.md", "r") as fh:
8 | long_description = fh.read()
9 |
10 | VERSION = "0.5.1"
11 |
12 | setup(name='clayrs',
13 | version=VERSION,
14 | license='GPL-3.0',
15 | author='Antonio Silletti, Elio Musacchio, Roberta Sallustio',
16 | install_requires=requirements,
17 | description='Complexly represent contents, build recommender systems, evaluate them. All in one place!',
18 | long_description=long_description,
19 | long_description_content_type="text/markdown",
20 | keywords=['recommender system', 'cbrs', 'evaluation', 'recsys'],
21 | url='https://github.com/swapUniba/ClayRS',
22 | include_package_data=True,
23 | packages=setuptools.find_packages(),
24 | python_requires='>=3.8',
25 |
26 | classifiers=[
27 | 'Development Status :: 3 - Alpha',
28 | 'Intended Audience :: Developers',
29 | 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
30 | 'Operating System :: OS Independent',
31 | 'Programming Language :: Python',
32 | 'Programming Language :: Python :: 3',
33 | 'Programming Language :: Python :: 3 :: Only',
34 | 'Programming Language :: Python :: 3.8',
35 | 'Programming Language :: Python :: 3.9',
36 | 'Programming Language :: Python :: 3.10',
37 | 'Topic :: Software Development :: Libraries',
38 | 'Topic :: Software Development :: Libraries :: Python Modules',
39 | 'Topic :: Software Development :: Testing :: Unit'
40 | ]
41 |
42 | )
43 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/content_analyzer/content_techniques/textual_techniques/embedding_techniques/combining_embeddings.md:
--------------------------------------------------------------------------------
1 | # Combine Embeddings
2 |
3 | Via the following, you can obtain embeddings of *coarser* granularity from models which return
4 | embeddings of *finer* granularity (e.g. obtain sentence embeddings from a model which returns word embeddings)
5 |
6 | ```python
7 | from clayrs import content_analyzer as ca
8 |
9 | # obtain sentence embeddings combining token embeddings with a
10 | # centroid technique
11 | ca.Word2SentenceEmbedding(embedding_source=ca.Gensim('glove-twitter-50'),
12 | combining_technique=ca.Centroid())
13 | ```
14 |
15 | ::: clayrs.content_analyzer.Word2SentenceEmbedding
16 | handler: python
17 | options:
18 | show_root_toc_entry: true
19 | show_root_heading: true
20 |
21 | ::: clayrs.content_analyzer.Word2DocEmbedding
22 | handler: python
23 | options:
24 | show_root_toc_entry: true
25 | show_root_heading: true
26 |
27 | ::: clayrs.content_analyzer.Sentence2DocEmbedding
28 | handler: python
29 | options:
30 | show_root_toc_entry: true
31 | show_root_heading: true
32 |
33 | ## Combining Techniques
34 |
35 | ::: clayrs.content_analyzer.Centroid
36 | handler: python
37 | options:
38 | heading_level: 3
39 | show_root_toc_entry: true
40 | show_root_heading: true
41 |
42 | ::: clayrs.content_analyzer.Sum
43 | handler: python
44 | options:
45 | heading_level: 3
46 | show_root_toc_entry: true
47 | show_root_heading: true
48 |
49 | ::: clayrs.content_analyzer.SingleToken
50 | handler: python
51 | options:
52 | heading_level: 3
53 | show_root_toc_entry: true
54 | show_root_heading: true
--------------------------------------------------------------------------------
/test/content_analyzer/ratings_manager/test_rating_processor.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from clayrs.content_analyzer.ratings_manager.score_processor import NumberNormalizer
3 |
4 |
5 | class TestNumberNormalizer(TestCase):
6 | def test_fit(self):
7 | scores = [1, 2, 5, 5, 3, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 10]
8 |
9 | result = []
10 | for score in scores:
11 | converted = NumberNormalizer(scale=(1, 10)).fit(score)
12 | result.append(converted)
13 |
14 | expected = [-1.0, -0.77777777, -0.11111111, -0.11111111,
15 | -0.55555555, -0.44444444, -0.42222222, -0.39999999,
16 | -0.37777777, -0.35555555, -0.33333333, 1.0]
17 |
18 | for expected_score, result_score in zip(expected, result):
19 | self.assertAlmostEqual(expected_score, result_score)
20 |
21 | # Test with rounding at the fourth digit
22 | result_rounded = []
23 | for score in scores:
24 | converted_rounded = NumberNormalizer(scale=(1, 10), decimal_rounding=4).fit(score)
25 | result_rounded.append(converted_rounded)
26 |
27 | expected_rounded = [-1.0, -0.7778, -0.1111, -0.1111, -0.5556,
28 | -0.4444, -0.4222, -0.4, -0.3778, -0.3556,
29 | -0.3333, 1.0]
30 |
31 | for expected_score_rounded, result_score_rounded in zip(expected_rounded, result_rounded):
32 | self.assertAlmostEqual(expected_score_rounded, result_score_rounded)
33 |
34 | def test_error(self):
35 |
36 | # 2 numbers must be passed
37 | with self.assertRaises(ValueError):
38 | NumberNormalizer(scale=(1,))
39 |
40 | with self.assertRaises(ValueError):
41 | NumberNormalizer(scale=(1, 2, 3))
42 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/first_steps/colab_examples.md:
--------------------------------------------------------------------------------
1 | # Colab examples
2 |
3 | The GitHub repository hosts some IPython notebooks to get you up and running with the framework!
4 |
5 | To use them you can use [Google Colab](https://colab.research.google.com/?hl=it):
6 |
7 | * Go to Colab and open `File > Open notebook`
8 |
9 |
10 |
11 |
12 | * Then go to `GitHub` section, write ***swapUniba/ClayRS*** in the first text box and choose the example you want
13 | to run!
14 |
15 |
16 |
17 |
18 | ## Available examples
19 |
20 | All the following use the ***Movielens 100k*** dataset
21 |
22 | * `1_tfidf_centroid.ipynb`: the easiest example, a good starting point for newcomers to the framework.
23 | It guides you through representing a field of the *raw source* via the *TfIdf* technique, instantiating a
24 | `CentroidVector` algorithm, and evaluating the generated recommendations with several state-of-the-art metrics;
25 |
26 | * `2_embeddings_randomforest.ipynb`: a slightly more complex example, where *several fields* are represented
27 | with *several techniques*, including ***embedding techniques***. For the recommendation phase a
28 | `Random Forest` classifier is used;
29 |
30 | * `3_graph_pagerank.ipynb`: it will guide you on how to perform *graph based recommendation* via `ClayRS`
31 | (how to instantiate a graph, how to manipulate it, how to load exogenous properties). The *Personalized PageRank*
32 | algorithm is used in the recsys phase;
33 |
34 | * `4_evaluate_other_recs.ipynb`: a *wildcard* example which shows how to export results (and intermediate results)
35 | obtained by `ClayRS`, but also how to evaluate ***external*** recommendation lists (i.e. recommendations generated via
36 | other tools)
--------------------------------------------------------------------------------
/clayrs/evaluation/metrics/metrics.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from abc import ABC, abstractmethod
3 | from functools import wraps
4 | from typing import TYPE_CHECKING
5 |
6 | import numpy as np
7 |
8 | if TYPE_CHECKING:
9 | from clayrs.recsys.partitioning import Split
10 |
11 |
12 | class Metric(ABC):
13 | """
14 | Abstract class that generalizes the metric concept
15 |
16 | Every metric may need a different kind of "prediction": some (e.g. NDCG, MRR, etc.) may need recommendation lists in
17 | which the recsys ranks every unseen item, others (e.g. MAE, RMSE, etc.) may need a score prediction where the recsys
18 | must predict the rating that a user would give to an unseen item.
19 | So a Metric category (subclass of this class) must implement the "eval_fit_recsys(...)" method specifying its needs,
20 | while every single metric (subclass of the metric category class) must implement the "perform(...)" method
21 | specifying how to execute the metric computation
22 | """
23 |
24 | @abstractmethod
25 | def __str__(self):
26 | raise NotImplementedError
27 |
28 | @abstractmethod
29 | def perform(self, split: Split):
30 | raise NotImplementedError
31 |
32 |
33 | def handler_different_users(func):
34 | """
35 | Handler that covers the case in which there are different users between the predictions and the truth of a split: in
36 | that case a ValueError exception is raised
37 | """
38 | @wraps(func)
39 | def inner_function(self, split, *args, **kwargs):
40 |
41 | if not np.array_equal(np.sort(split.pred.unique_user_id_column.flat),
42 | np.sort(split.truth.unique_user_id_column.flat)):
43 | raise ValueError("Predictions and truths must contain the same users!")
44 |
45 | return func(self, split, *args, **kwargs)
46 |
47 | return inner_function
48 |
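A sketch of what the decorator enforces, using stand-in objects in place of a real `Split` (actual splits come from `clayrs.recsys.partitioning`):

```python
import numpy as np
from types import SimpleNamespace

from clayrs.evaluation.metrics.metrics import handler_different_users


class DummyMetric:
    @handler_different_users
    def perform(self, split):
        return "computed"


pred = SimpleNamespace(unique_user_id_column=np.array(["u1", "u2"]))
truth = SimpleNamespace(unique_user_id_column=np.array(["u2", "u1"]))

# same users on both sides (order is irrelevant): the metric runs
print(DummyMetric().perform(SimpleNamespace(pred=pred, truth=truth)))  # 'computed'

bad_truth = SimpleNamespace(unique_user_id_column=np.array(["u1", "u3"]))
# DummyMetric().perform(SimpleNamespace(pred=pred, truth=bad_truth))  # raises ValueError
```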
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_loader/test_gensim_loader.py:
--------------------------------------------------------------------------------
1 | from random import random
2 | from unittest import mock
3 | from unittest.mock import patch, Mock, MagicMock
4 | import numpy as np
5 |
6 | from test.content_analyzer.embeddings.test_embedding_source import TestEmbeddingSource
7 | from clayrs.content_analyzer.embeddings.embedding_loader.gensim import Gensim
8 |
9 | result_matrix = {
10 | 'title': np.array([random() for _ in range(25)]),
11 | 'plot': np.array([random() for _ in range(25)])
12 | }
13 |
14 |
15 | def get_item(key):
16 | return result_matrix[key]
17 |
18 |
19 | def similar_by_vector(vector, n_to_find):
20 | for i, vec in enumerate(result_matrix.values()):
21 | if np.array_equal(vec, vector):
22 | return [(list(result_matrix.keys())[i], 1)]
23 |
24 |
25 | mocked_model = MagicMock()
26 | mocked_model.__getitem__.side_effect = get_item
27 |
28 | mocked_model.similar_by_vector.side_effect = similar_by_vector
29 | mocked_model.vector_size = 25
30 |
31 |
32 | class TestGensimDownloader(TestEmbeddingSource):
33 |
34 | def test_load(self):
35 |
36 | with mock.patch('gensim.downloader.info', return_value={'models': 'glove-twitter-25'}):
37 | with mock.patch('gensim.downloader.load', return_value=mocked_model):
38 | source = Gensim('glove-twitter-25')
39 |
40 | # result is a matrix containing 2 rows, one for 'title', one for 'plot'
41 | result = source.load(["title", "plot"])
42 |
43 | # the expected shape of result is (2, 25):
44 | # 2 for words and 25 due to the model 'glove-twitter-25'
45 | expected_shape = (2, 25)
46 | self.assertEqual(expected_shape, result.shape)
47 |
48 | self.assertWordEmbeddingMatches(source, result[0], "title")
49 | self.assertWordEmbeddingMatches(source, result[1], "plot")
50 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/word2vec.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from gensim.models import Word2Vec
4 |
5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner
6 |
7 |
8 | class GensimWord2Vec(GensimWordEmbeddingLearner):
9 | """
10 | Class that implements Word2Vec model thanks to the Gensim library.
11 |
12 | If a pre-trained local Word2Vec model must be loaded, put its path in the `reference` parameter.
13 | Otherwise, a Word2Vec model will be trained from scratch based on the preprocessed corpus of the contents to complexly
14 | represent
15 |
16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set
17 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to
18 | produce contents in the current run
19 |
20 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/word2vec.html)
21 | to see what else can be customized
22 |
23 | Args:
24 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the
25 | trained model won't be saved after training and will only be used to produce contents in the current run
26 | auto_save: If True, the model will be saved in the path specified in `reference` parameter
27 | """
28 |
29 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
30 | super().__init__(reference, auto_save, ".kv", **kwargs)
31 |
32 | def fit_model(self, corpus: List):
33 | self.model = Word2Vec(sentences=corpus, **self.additional_parameters).wv
34 |
35 | def __str__(self):
36 | return "GensimWord2Vec"
37 |
38 | def __repr__(self):
39 | return f'GensimWord2Vec(attributes={str(self.model)})'
40 |
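A training sketch on a toy corpus, assuming extra keyword arguments (here `vector_size` and `min_count`) are forwarded to gensim's `Word2Vec` as the docstring states:

```python
from clayrs.content_analyzer.embeddings.embedding_learner.word2vec import GensimWord2Vec

learner = GensimWord2Vec(reference=None, vector_size=10, min_count=1)

corpus = [["jungle", "board", "game"],
          ["jungle", "creatures", "loose"]]

learner.fit_model(corpus)
print(learner.model["jungle"].shape)  # (10,): trained KeyedVectors
```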
--------------------------------------------------------------------------------
/clayrs/utils/load_content.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import lzma
3 | import os
4 | import pickle
5 |
6 | from clayrs.content_analyzer.content_representation.representation_container import RepresentationContainer
7 | from clayrs.content_analyzer.content_representation.content import Content
8 |
9 |
10 | def load_content_instance(directory: str, content_id: str, only_field_representations: dict = None) -> Content:
11 | """
12 | Loads a serialized content
13 | Args:
14 | directory: Path to the directory in which the content is stored
15 | content_id: ID of the content to load (its filename)
16 | only_field_representations: Specify exactly which representations to load for the content, as a dict mapping
17 | each field to a list of representation ids (e.g. {'Plot': [0], 'Genres': [1]}). Useful for alleviating memory load
18 |
19 | Returns:
20 | content (Content)
21 | """
22 | try:
23 | content_filename = os.path.join(directory, '{}.xz'.format(content_id))
24 | with lzma.open(content_filename, "rb") as content_file:
25 | content = pickle.load(content_file)
26 |
27 | if only_field_representations is not None:
28 | smaller_content = Content(content_id)
29 | field_dict_smaller = {}
30 | for field, repr_id_list in only_field_representations.items():
31 | field_dict_smaller[field] = [content.get_field_representation(field, repr_id)
32 | for repr_id in repr_id_list]
33 |
34 | for field, repr_list in field_dict_smaller.items():
35 | ext_id_list = [id if isinstance(id, str) else None for id in only_field_representations[field]]
36 | field_repr_container = RepresentationContainer(repr_list, ext_id_list)
37 | smaller_content.append_field(field, field_repr_container)
38 |
39 | content = smaller_content
40 |
41 | except FileNotFoundError:
42 | content = None
43 |
44 | return content
45 |
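
A brief usage sketch, assuming a directory of serialized contents such as the `movies_codified/` test folder; note that the values of `only_field_representations` are lists of representation ids:

    from clayrs.utils.load_content import load_content_instance

    # load only representation 0 of the 'Plot' field for a single item
    content = load_content_instance("movies_codified/", "tt0112281",
                                    only_field_representations={"Plot": [0]})

    # a missing '<directory>/<content_id>.xz' file yields None rather than an error
    missing = load_content_instance("movies_codified/", "not_there")
    assert missing is None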
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/fasttext.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from gensim.models.fasttext import FastText
4 |
5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner
6 |
7 |
8 | class GensimFastText(GensimWordEmbeddingLearner):
9 | """
10 | Class that implements FastText model thanks to the Gensim library.
11 |
12 |     If a pre-trained local FastText model must be loaded, put its path in the `reference` parameter.
13 |     Otherwise, a FastText model will be trained from scratch on the preprocessed corpus of the contents to
14 |     represent
15 |
16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set
17 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to
18 | produce contents in the current run
19 |
20 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/fasttext.html)
21 | to see what else can be customized
22 |
23 | Args:
24 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the
25 | trained model won't be saved after training and will only be used to produce contents in the current run
26 | auto_save: If True, the model will be saved in the path specified in `reference` parameter
27 | """
28 |
29 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
30 | super().__init__(reference, auto_save, ".kv", **kwargs)
31 |
32 | def fit_model(self, corpus: List):
33 | self.model = FastText(sentences=corpus, **self.additional_parameters).wv
34 |
35 | def __str__(self):
36 | return "FastText"
37 |
38 | def __repr__(self):
39 | return f"FastText(reference={self.reference}, auto_save={self._auto_save}, " \
40 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})"
41 |
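
The usage pattern mirrors the other Gensim learners; a small sketch (illustrative corpus and parameters) that also highlights FastText's subword vectors, which can embed words unseen at training time:

    from clayrs.content_analyzer.embeddings.embedding_learner.fasttext import GensimFastText

    corpus = [["the", "movie", "was", "great"],
              ["a", "great", "plot", "and", "cast"]]

    learner = GensimFastText(vector_size=25, min_count=1)  # reference=None: model is not persisted
    learner.fit_model(corpus)

    # thanks to character n-grams, FastText can build a vector even for an unseen word
    vector = learner.model["greatest"]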
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/test_contents_loader.py:
--------------------------------------------------------------------------------
1 | import os
2 | import unittest
3 | from os import listdir
4 | from os.path import splitext, isfile, join
5 |
6 | from clayrs.content_analyzer import SearchIndex
7 | from clayrs.recsys.content_based_algorithm.contents_loader import LoadedContentsDict, LoadedContentsIndex
8 | from test import dir_test_files
9 |
10 |
11 | class TestLoadedContentsDict(unittest.TestCase):
12 | def test_all(self):
13 | # test load_available_contents for content based algorithm
14 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/')
15 |
16 | interface_dict = LoadedContentsDict(movies_dir)
17 |
18 | # we are testing get_contents_interface
19 | self.assertIsInstance(interface_dict.get_contents_interface(), dict)
20 |
21 |         # since we didn't specify which items to load, we expect it to have loaded all items from the folder
22 | expected = {splitext(filename)[0]
23 | for filename in listdir(movies_dir)
24 | if isfile(join(movies_dir, filename)) and splitext(filename)[1] == ".xz"}
25 |
26 | # we are testing also iter
27 | result = set(interface_dict)
28 | self.assertEqual(expected, result)
29 |
30 | # test loaded contents specified
31 | interface_dict = LoadedContentsDict(movies_dir, {'tt0112281', 'tt0112302'})
32 |
33 | # we are testing len
34 | self.assertTrue(len(interface_dict) == 2)
35 |
36 | # we are testing getitem
37 | self.assertIsNotNone(interface_dict['tt0112281'])
38 | self.assertIsNotNone(interface_dict['tt0112302'])
39 |
40 | # we are testing get
41 | self.assertIsNotNone(interface_dict.get('tt0112281'))
42 | self.assertIsNone(interface_dict.get('should be None'))
43 |
44 |
45 | class TestLoadedContentsIndex(unittest.TestCase):
46 | def test_all(self):
47 | index = "../test/test_files/index"
48 |
49 | self.assertIsInstance(LoadedContentsIndex(index).get_contents_interface(), SearchIndex)
50 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/doc2vec.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from gensim.models.doc2vec import Doc2Vec, TaggedDocument
4 |
5 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimWordEmbeddingLearner
6 |
7 |
8 | class GensimDoc2Vec(GensimWordEmbeddingLearner):
9 | """
10 | Class that implements Doc2Vec model thanks to the Gensim library.
11 |
12 |     If a pre-trained local Doc2Vec model must be loaded, put its path in the `reference` parameter.
13 |     Otherwise, a Doc2Vec model will be trained from scratch on the preprocessed corpus of the contents to
14 |     represent
15 |
16 | If you'd like to save the model once trained, set the path in the `reference` parameter and set
17 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to
18 | produce contents in the current run
19 |
20 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/doc2vec.html)
21 | to see what else can be customized
22 |
23 | Args:
24 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the
25 | trained model won't be saved after training and will only be used to produce contents in the current run
26 | auto_save: If True, the model will be saved in the path specified in `reference` parameter
27 | """
28 |
29 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
30 | super().__init__(reference, auto_save, ".kv", **kwargs)
31 |
32 | def fit_model(self, corpus: List):
33 | tagged_data = [TaggedDocument(doc, [i]) for i, doc in enumerate(corpus)]
34 | self.model = Doc2Vec(tagged_data, **self.additional_parameters).wv
35 |
36 | def __str__(self):
37 | return "GensimDoc2Vec"
38 |
39 | def __repr__(self):
40 | return f"GensimDoc2Vec(reference={self.reference}, auto_save={self._auto_save}, " \
41 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})"
42 |
--------------------------------------------------------------------------------
/test/content_analyzer/field_content_production_techniques/embedding_technique/test_combining_technique.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import numpy as np
3 |
4 | from clayrs.content_analyzer.field_content_production_techniques.embedding_technique.combining_technique import \
5 | Centroid, Sum, SingleToken
6 |
7 |
8 | class TestCentroid(TestCase):
9 | def test_combine(self):
10 | z = np.ndarray(shape=(3, 3))
11 |
12 | z[0, :] = [1, 1, 1]
13 | z[1, :] = [2, 2, 2]
14 | z[2, :] = [3, 3, 3]
15 |
16 | combiner = Centroid()
17 | result = combiner.combine(z)
18 |
19 | expected = np.ndarray(shape=(3, ))
20 | expected[:] = [2, 2, 2]
21 |
22 | self.assertTrue((result == expected).all())
23 |
24 |
25 | class TestSum(TestCase):
26 | def test_combine(self):
27 | z = np.ndarray(shape=(3, 3))
28 |
29 | z[0, :] = [1, 9, 1]
30 | z[1, :] = [7, 2, 4]
31 | z[2, :] = [3, 5, 3]
32 |
33 | combiner = Sum()
34 | result = combiner.combine(z)
35 |
36 | expected = np.ndarray(shape=(3, ))
37 | expected[:] = [11, 16, 8]
38 |
39 | self.assertTrue((result == expected).all())
40 |
41 |
42 | class TestSingleToken(TestCase):
43 | def test_combine(self):
44 | z = np.ndarray(shape=(3, 3))
45 |
46 | z[0, :] = [1, 9, 1]
47 | z[1, :] = [7, 2, 4]
48 | z[2, :] = [3, 5, 3]
49 |
50 | combiner = SingleToken(0)
51 | result = combiner.combine(z)
52 |
53 | expected = np.ndarray(shape=(3, ))
54 | expected[:] = [1, 9, 1]
55 |
56 | self.assertTrue((result == expected).all())
57 |
58 | combiner = SingleToken(2)
59 | result = combiner.combine(z)
60 |
61 | expected = np.ndarray(shape=(3, ))
62 | expected[:] = [3, 5, 3]
63 |
64 | self.assertTrue((result == expected).all())
65 |
66 | def test_raise(self):
67 | z = np.ndarray(shape=(3, 3))
68 |
69 | z[0, :] = [1, 9, 1]
70 | z[1, :] = [7, 2, 4]
71 | z[2, :] = [3, 5, 3]
72 |
73 | with self.assertRaises(IndexError):
74 | SingleToken(99).combine(z)
75 |
--------------------------------------------------------------------------------
/docs/mkdocs/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | hide:
3 | - navigation
4 | - toc
5 | ---
6 |
7 | !!! warning
8 |
9 | Docs are complete, but revision is still a Work in Progress. Sorry for any typos!
10 |
11 |
12 |
13 |
14 |
15 | # Welcome to ClayRS's documentation!
16 |
17 | [](https://github.com/swapUniba/ClayRS/actions/workflows/testing_pipeline.yml)
18 | [](https://swapuniba.github.io/ClayRS/)
19 | [](https://codecov.io/gh/swapUniba/ClayRS)
20 | [](https://www.python.org/downloads/)
21 |
22 | ***ClayRS*** is a Python framework for (mainly) content-based recommender systems which allows you to perform several operations, starting from a raw representation of users and items up to building and evaluating a recommender system. It also supports graph-based recommendation with feature selection algorithms and graph manipulation methods.
23 |
24 | The framework has three main modules, which you can also use individually:
25 |
26 |
27 |
28 |
29 |
30 | Given a raw source, the ***Content Analyzer***:
31 |
32 | * Creates and serializes contents
33 |     * *Using the chosen configuration*
34 |
35 | The ***RecSys*** module allows you to:
36 |
37 | * Instantiate a recommender system
38 | * *Using items and users serialized by the Content Analyzer*
39 | * Make score *prediction* or *recommend* items for the active user(s)
40 |
41 | The ***EvalModel*** has the task of evaluating a recommender system, using several state-of-the-art metrics.
42 |
43 | The various sections of this documentation will guide you in becoming a full expert of **ClayRS**!
44 |
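As a quick taste, here is a minimal (deliberately simplified) Content Analyzer configuration; the source file, id field and field name below are placeholders:

```python
import clayrs.content_analyzer as ca

# describe how to turn a raw JSON source into serialized contents
config = ca.ItemAnalyzerConfig(
    ca.JSONFile('items.json'),          # raw source
    id='item_id',                       # field identifying each content
    output_directory='items_codified/'  # where contents will be serialized
)

# represent the 'plot' field with a TF-IDF vector after NLTK preprocessing
config.add_single_config(
    'plot',
    ca.FieldConfig(ca.SkLearnTfIdf(), ca.NLTK(stopwords_removal=True))
)

ca.ContentAnalyzer(config).fit()
```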
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_loader/vector_strategy.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 |
3 | import numpy as np
4 | import torch
5 |
6 |
7 | class VectorStrategy(ABC):
8 | """
 9 |     Abstract class for strategies that combine multiple layers of a model's output
10 |     into a single final representation
11 | """
12 | def __init__(self, last_interesting_layers: int):
13 | self.last_interesting_layers = last_interesting_layers
14 |
15 | @abstractmethod
16 | def build_embedding(self, token_embeddings) -> np.ndarray:
17 | raise NotImplementedError
18 |
19 |
20 | class SumStrategy(VectorStrategy):
21 | """
22 | Class which sums the `last_interesting_layers` of the output obtained by the Transformer model
23 |
24 | Args:
25 |         last_interesting_layers: Number of last layers to sum in order to summarize information
26 | """
27 | def __init__(self, last_interesting_layers: int):
28 | super().__init__(last_interesting_layers)
29 |
30 | def build_embedding(self, token_embeddings: torch.Tensor) -> np.ndarray:
31 | token_vecs_sum = []
32 | for token in token_embeddings:
33 | sum_vec = torch.sum(token[-self.last_interesting_layers:], dim=0)
34 | token_vecs_sum.append(sum_vec)
35 | return torch.stack(token_vecs_sum).numpy()
36 |
37 | def __str__(self):
38 | return "SumStrategy"
39 |
40 | def __repr__(self):
41 | return f"SumStrategy(last_interesting_layers={self.last_interesting_layers})"
42 |
43 |
44 | class CatStrategy(VectorStrategy):
45 | """
46 |     Class which concatenates the `last_interesting_layers` of the output obtained by the Transformer model
47 |
48 | Args:
49 |         last_interesting_layers: Number of last layers to concatenate in order to summarize information
50 | """
51 | def __init__(self, last_interesting_layers: int):
52 | super().__init__(last_interesting_layers)
53 |
54 | def build_embedding(self, token_embeddings: torch.Tensor) -> np.ndarray:
55 | token_vecs_cat = []
56 | for token in token_embeddings:
57 | cat_vec = token[-1]
58 | for i in range(-2, -self.last_interesting_layers - 1, -1):
59 | cat_vec = torch.cat((cat_vec, token[i]), dim=0)
60 | token_vecs_cat.append(cat_vec)
61 | return torch.stack(token_vecs_cat).numpy()
62 |
63 | def __str__(self):
64 | return "CatStrategy"
65 |
66 | def __repr__(self):
67 | return f"CatStrategy(last_interesting_layers={self.last_interesting_layers})"
68 |
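
A small shape-oriented sketch (dummy tensor, illustrative sizes): given token embeddings of shape (tokens, layers, hidden), SumStrategy keeps the hidden size while CatStrategy multiplies it by the number of combined layers:

    import torch

    from clayrs.content_analyzer.embeddings.embedding_loader.vector_strategy import SumStrategy, CatStrategy

    token_embeddings = torch.rand(3, 12, 768)  # 3 tokens, 12 layers, hidden size 768

    summed = SumStrategy(4).build_embedding(token_embeddings)
    print(summed.shape)  # (3, 768): the last 4 layers are summed element-wise

    concatenated = CatStrategy(4).build_embedding(token_embeddings)
    print(concatenated.shape)  # (3, 3072): the last 4 layers are concatenated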
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/test_random_indexing.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import os
4 |
5 | import gensim
6 | import numpy as np
7 | from gensim.corpora import Dictionary
8 | from gensim.models import RpModel
9 | from gensim.test.utils import common_texts
10 | from clayrs.content_analyzer.embeddings.embedding_learner.random_indexing import GensimRandomIndexing
11 |
12 | num_topics = 10
13 | model_path = 'test_model_ri'
14 |
15 |
16 | class TestRandomIndexing(TestCase):
17 | def test_all(self):
18 | my_learner = GensimRandomIndexing(model_path, num_topics=num_topics)
19 |
20 | corpus = common_texts
21 | my_learner.fit_model(corpus)
22 |
23 | # check that vector size is correct
24 | self.assertEqual(num_topics, my_learner.get_vector_size())
25 |
26 | common_dictionary = Dictionary(common_texts)
27 | common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
28 | expected_learner = RpModel(common_corpus, num_topics=num_topics)
29 |
30 |         # test get_embedding on a non-existent document
31 | unseen_doc_text = ['this', 'is', 'a', 'new', 'document', 'which', 'doesnt', 'exist']
32 |
33 | # check that the doc is unseen (embedding has len 0)
34 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text)
35 | expected = expected_learner[unseen_doc]
36 | self.assertTrue(len(expected) == 0)
37 |
38 | # in our framework if the doc is unseen KeyError is raised
39 | with self.assertRaises(KeyError):
40 | my_learner.get_embedding(unseen_doc_text)
41 |
42 |         # test get_embedding on an existing document
43 | unseen_doc_text = ['human', 'time', 'trees']
44 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text)
45 | expected = expected_learner[unseen_doc]
46 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics)
47 | result_vector = my_learner.get_embedding(unseen_doc_text)
48 |
49 |         # we don't have a way to check that the 2 vectors are the same, because they are built at random.
50 |         # We just check that they have the same length
51 |
52 | self.assertEqual(len(expected_vector), len(result_vector))
53 |
54 | # test save
55 | my_learner.save()
56 | self.assertTrue(os.path.isfile(f"{model_path}.model"))
57 |
58 | # test that after load we obtain a valid embedding
59 | my_learner_loaded = GensimRandomIndexing(model_path)
60 | my_learner_loaded.load_model()
61 | unseen_doc_text = ['human', 'time', 'trees']
62 |         result_vector = my_learner_loaded.get_embedding(unseen_doc_text)
63 |
64 | self.assertTrue(np.any(result_vector))
65 |
--------------------------------------------------------------------------------
/test/test_files/complex_contents/create_complex_contents.py:
--------------------------------------------------------------------------------
1 | import clayrs.content_analyzer as ca
2 |
3 | items_json = "../movies_info_reduced.json"
4 | users_dat = "../users_70.dat"
5 |
6 |
7 | def item_fit():
8 | config = ca.ItemAnalyzerConfig(
9 | ca.JSONFile(items_json),
10 | id='imdbID',
11 | output_directory='movies_codified/',
12 | export_json=True
13 | )
14 |
15 | config.add_multiple_config(
16 | 'Plot',
17 | [
18 | ca.FieldConfig(ca.SkLearnTfIdf(),
19 | ca.NLTK(stopwords_removal=True), id='tfidf'),
20 | ca.FieldConfig(ca.SentenceEmbeddingTechnique(ca.Sbert('paraphrase-distilroberta-base-v1')),
21 | ca.NLTK(stopwords_removal=True), id='embedding'),
22 | ca.FieldConfig(ca.OriginalData(), id='index_original', memory_interface=ca.SearchIndex('index')),
23 | ca.FieldConfig(ca.OriginalData(), ca.NLTK(stopwords_removal=True),
24 | id='index_preprocessed', memory_interface=ca.SearchIndex('index')),
25 | ]
26 | )
27 |
28 | config.add_multiple_config(
29 | 'Genre',
30 | [
31 | ca.FieldConfig(ca.WordEmbeddingTechnique(ca.Gensim('glove-twitter-25')),
32 | ca.NLTK(stemming=True), id='embedding'),
33 | ca.FieldConfig(ca.WhooshTfIdf(),
34 | ca.NLTK(stemming=True), id='tfidf'),
35 | ca.FieldConfig(ca.OriginalData(), id='index_original', memory_interface=ca.SearchIndex('index')),
36 | ca.FieldConfig(ca.OriginalData(), ca.NLTK(stopwords_removal=True),
37 | memory_interface=ca.SearchIndex('index')),
38 | ]
39 | )
40 |
41 | config.add_multiple_config(
42 | 'Year',
43 | [
44 | ca.FieldConfig(ca.OriginalData(), id='default_string'),
45 | ca.FieldConfig(ca.OriginalData(dtype=int), id='int')
46 | ]
47 | )
48 |
49 | config.add_single_config(
50 | 'imdbRating',
51 | ca.FieldConfig(ca.OriginalData(dtype=float))
52 | )
53 |
54 | config.add_single_exogenous(
55 | ca.ExogenousConfig(ca.DBPediaMappingTechnique("dbo:Film", "Title"), id='dbpedia')
56 | )
57 |
58 | ca.ContentAnalyzer(config).fit()
59 |
60 |
61 | def users_fit():
62 | config = ca.UserAnalyzerConfig(
63 | ca.DATFile(users_dat),
64 | id='0',
65 | output_directory='users_codified',
66 | export_json=True
67 | )
68 |
69 | config.add_single_exogenous(
70 | ca.ExogenousConfig(ca.PropertiesFromDataset(), id='local')
71 | )
72 |
73 | ca.ContentAnalyzer(config).fit()
74 |
75 |
76 | if __name__ == "__main__":
77 | item_fit()
78 | users_fit()
79 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/test_lda.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import os
4 |
5 | import numpy as np
6 | from gensim.corpora import Dictionary
7 | from gensim.models import LdaModel
8 | from gensim.test.utils import common_texts
9 | import gensim
10 |
11 | from clayrs.content_analyzer.embeddings.embedding_learner.lda import GensimLDA
12 |
13 | # we fix random_state for reproducibility
14 | random_state = 42
15 | num_topics = 100
16 | model_path = 'test_model_lda'
17 |
18 |
19 | class TestLda(TestCase):
20 | def test_all(self):
21 | my_learner = GensimLDA(model_path, num_topics=num_topics, random_state=random_state)
22 |
23 | corpus = common_texts
24 | my_learner.fit_model(corpus)
25 |
26 | # check that vector size is correct
27 | self.assertEqual(num_topics, my_learner.get_vector_size())
28 |
29 | common_dictionary = Dictionary(common_texts)
30 | common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
31 | expected_learner = LdaModel(common_corpus, num_topics=num_topics, random_state=random_state)
32 |
33 |         # test get_embedding on a non-existent document
34 | unseen_doc_text = ['this', 'is', 'a', 'new', 'document', 'which', 'doesnt', 'exist']
35 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text)
36 | expected = expected_learner[unseen_doc]
37 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics)
38 |
39 | result_vector = my_learner.get_embedding(unseen_doc_text)
40 |
41 | self.assertTrue(np.array_equal(expected_vector, result_vector))
42 |
43 |         # test get_embedding on an existing document
44 | unseen_doc_text = ['human', 'time', 'trees']
45 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text)
46 | expected = expected_learner[unseen_doc]
47 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics)
48 |
49 | result_vector = my_learner.get_embedding(unseen_doc_text)
50 |
51 | self.assertTrue(np.array_equal(expected_vector, result_vector))
52 |
53 | # test save
54 | my_learner.save()
55 | self.assertTrue(os.path.isfile(f"{model_path}.model"))
56 | self.assertTrue(os.path.isfile(f"{model_path}.model.expElogbeta.npy"))
57 | self.assertTrue(os.path.isfile(f"{model_path}.model.id2word"))
58 | self.assertTrue(os.path.isfile(f"{model_path}.model.state"))
59 |
60 | # test that after load we obtain a valid embedding
61 | my_learner_loaded = GensimLDA(model_path)
62 | my_learner_loaded.load_model()
63 | unseen_doc_text = ['human', 'time', 'trees']
64 |         result_vector = my_learner_loaded.get_embedding(unseen_doc_text)
65 |
66 | self.assertTrue(np.any(result_vector))
67 |
--------------------------------------------------------------------------------
/test/content_analyzer/embeddings/embedding_learner/test_latent_semantic_analysis.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import os
4 | import gensim
5 |
6 | import numpy as np
7 | from gensim.models import LsiModel
8 |
9 | from clayrs.content_analyzer.embeddings.embedding_learner.latent_semantic_analysis import GensimLatentSemanticAnalysis
10 | from gensim.corpora import Dictionary
11 | from gensim.test.utils import common_texts
12 |
13 | num_topics = 10
14 | model_path = 'test_model_lsa'
15 |
16 |
17 | class TestLatentSemanticAnalysis(TestCase):
18 | def test_all(self):
19 | my_learner = GensimLatentSemanticAnalysis(model_path, num_topics=num_topics)
20 |
21 | corpus = common_texts
22 | my_learner.fit_model(corpus)
23 |
24 | # check that vector size is correct
25 | self.assertEqual(num_topics, my_learner.get_vector_size())
26 |
27 | common_dictionary = Dictionary(common_texts)
28 | common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
29 | expected_learner = LsiModel(common_corpus, num_topics=num_topics)
30 |
31 |         # test get_embedding on a non-existent document
32 | unseen_doc_text = ['this', 'is', 'a', 'new', 'document', 'which', 'doesnt', 'exist']
33 |
34 | # check that the doc is unseen (embedding has len 0)
35 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text)
36 | expected = expected_learner[unseen_doc]
37 | self.assertTrue(len(expected) == 0)
38 |
39 | # in our framework if the doc is unseen KeyError is raised
40 | with self.assertRaises(KeyError):
41 | my_learner.get_embedding(unseen_doc_text)
42 |
43 |         # test get_embedding on an existing document
44 | unseen_doc_text = ['human', 'time', 'trees']
45 | unseen_doc = common_dictionary.doc2bow(unseen_doc_text)
46 | expected = expected_learner[unseen_doc]
47 | expected_vector: np.ndarray = gensim.matutils.sparse2full(expected, num_topics)
48 | result_vector = my_learner.get_embedding(unseen_doc_text)
49 |
50 |         # we don't have a way to check that the 2 vectors are the same, because they are built at random.
51 |         # We just check that they have the same length
52 |
53 | self.assertEqual(len(expected_vector), len(result_vector))
54 |
55 | # test save
56 | my_learner.save()
57 | self.assertTrue(os.path.isfile(f"{model_path}.model"))
58 | self.assertTrue(os.path.isfile(f"{model_path}.model.projection"))
59 |
60 | # test that after load we obtain a valid embedding
61 | my_learner_loaded = GensimLatentSemanticAnalysis(model_path)
62 | my_learner_loaded.load_model()
63 | unseen_doc_text = ['human', 'time', 'trees']
64 |         result_vector = my_learner_loaded.get_embedding(unseen_doc_text)
65 |
66 | self.assertTrue(np.any(result_vector))
67 |
68 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/lda.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import gensim
4 | import numpy as np
5 | from gensim.corpora import Dictionary
6 | from gensim.models import LdaModel
7 |
8 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimDocumentEmbeddingLearner
9 | from clayrs.content_analyzer.utils.check_tokenization import check_tokenized
10 |
11 |
12 | class GensimLDA(GensimDocumentEmbeddingLearner):
13 | """
14 | Class that implements Latent Dirichlet Allocation (LDA) thanks to the Gensim library.
15 |
16 |     If a pre-trained local LDA model must be loaded, put its path in the `reference` parameter.
17 |     Otherwise, an LDA model will be trained from scratch on the preprocessed corpus of the contents to
18 |     represent
19 |
20 | If you'd like to save the model once trained, set the path in the `reference` parameter and set
21 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to
22 | produce contents in the current run
23 |
24 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/ldamodel.html)
25 | to see what else can be customized
26 |
27 | Args:
28 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the
29 | trained model won't be saved after training and will only be used to produce contents in the current run
30 | auto_save: If True, the model will be saved in the path specified in `reference` parameter
31 | """
32 |
33 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
34 | super().__init__(reference, auto_save, ".model", **kwargs)
35 |
36 | def fit_model(self, corpus: List):
37 | dictionary = Dictionary(corpus)
38 | word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus]
39 | self.model = LdaModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters)
40 |
41 | def load_model(self):
42 | return LdaModel.load(self.reference)
43 |
44 | def get_vector_size(self) -> int:
45 | return self.model.num_topics
46 |
47 | def get_embedding(self, document_tokenized: List[str]) -> np.ndarray:
48 | unseen_doc = self.model.id2word.doc2bow(check_tokenized(document_tokenized))
49 | sparse_vector = self.model[unseen_doc]
50 |
51 | dense_vector: np.ndarray = gensim.matutils.sparse2full(sparse_vector, self.model.num_topics)
52 | return dense_vector
53 |
54 | def __str__(self):
55 | return "GensimLda"
56 |
57 | def __repr__(self):
58 | return f"GensimLda(reference={self.reference}, auto_save={self._auto_save}, " \
59 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})"
60 |
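
A compact usage sketch on gensim's toy corpus (the corpus and the `num_topics` value are illustrative; extra kwargs are forwarded to gensim's LdaModel):

    from gensim.test.utils import common_texts

    from clayrs.content_analyzer.embeddings.embedding_learner.lda import GensimLDA

    learner = GensimLDA(num_topics=10)  # reference=None: the trained model is not persisted
    learner.fit_model(common_texts)

    vector = learner.get_embedding(["human", "time", "trees"])
    print(vector.shape)  # (10,): one dense weight per topic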
--------------------------------------------------------------------------------
/test/utils/test_automatic_methods.py:
--------------------------------------------------------------------------------
1 | import inspect
2 | import unittest
3 |
4 | from clayrs.utils.automatic_methods import autorepr
5 |
6 |
7 | class TestAutomaticMethods(unittest.TestCase):
8 |
9 | @classmethod
10 | def setUpClass(cls) -> None:
11 |
12 | # init method with only positional attributes
13 | class OnlyPositional:
14 | def __init__(self, attribute1, attribute2):
15 | self._repr_string = autorepr(self, inspect.currentframe())
16 |
17 | def __repr__(self):
18 | return self._repr_string
19 |
20 | cls.only_pos_class = OnlyPositional('formal1', 'formal2')
21 |
22 | # init method with only args attributes
23 | class OnlyArgs:
24 | def __init__(self, *args):
25 | self._repr_string = autorepr(self, inspect.currentframe())
26 |
27 | def __repr__(self):
28 | return self._repr_string
29 |
30 | cls.only_args_class = OnlyArgs('only_args1', 'only_args2')
31 |
32 | # init method with only kwargs attributes
33 | class OnlyKwargs:
34 | def __init__(self, **kwargs):
35 | self._repr_string = autorepr(self, inspect.currentframe())
36 |
37 | def __repr__(self):
38 | return self._repr_string
39 |
40 | cls.only_kwargs_class = OnlyKwargs(kwargs1='only_kwargs1', kwargs2='only_kwargs2')
41 |
42 | # init method with all possible attributes
43 | class AllPossibleArgs:
44 | def __init__(self, attribute1, attribute2, *args, **kwargs):
45 | self._repr_string = autorepr(self, inspect.currentframe())
46 |
47 | def __repr__(self):
48 | return self._repr_string
49 |
50 | cls.all_possible_args_class = AllPossibleArgs('formal1', 'formal2', 'args1', 'args2', 'args3',
51 | kwargs1='kwargs_val')
52 |
53 | def test_autorepr(self):
54 |
55 | expected = "OnlyPositional(attribute1='formal1', attribute2='formal2')"
56 | result = repr(self.only_pos_class)
57 |
58 | self.assertEqual(expected, result)
59 |
60 | expected = "OnlyArgs(*args='only_args1', *args='only_args2')"
61 | result = repr(self.only_args_class)
62 |
63 | self.assertEqual(expected, result)
64 |
65 | expected = "OnlyKwargs(*kwargs_kwargs1='only_kwargs1', *kwargs_kwargs2='only_kwargs2')"
66 | result = repr(self.only_kwargs_class)
67 |
68 | self.assertEqual(expected, result)
69 |
70 | expected = "AllPossibleArgs(attribute1='formal1', attribute2='formal2', *args='args1', " \
71 | "*args='args2', *args='args3', *kwargs_kwargs1='kwargs_val')"
72 | result = repr(self.all_possible_args_class)
73 |
74 | self.assertEqual(expected, result)
75 |
76 |
77 | if __name__ == '__main__':
78 | unittest.main()
79 |
--------------------------------------------------------------------------------
/test/recsys/content_based_algorithm/centroid_vector/test_similarities.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from clayrs.recsys.content_based_algorithm.centroid_vector.similarities import CosineSimilarity
3 | import numpy as np
4 | from sklearn.metrics.pairwise import cosine_similarity
5 | from scipy import sparse
6 |
7 |
8 | class TestCosineSimilarity(TestCase):
9 | def test_perform(self):
10 | sim = CosineSimilarity()
11 |
12 | # vector comparison
13 | a = np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]])
14 | b = np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6]])
15 |
16 | res = sim.perform(a, b)
17 | expected = cosine_similarity(a, b, dense_output=True)
18 |
19 | np.testing.assert_allclose(expected, res)
20 |
21 | # single vector vs one matrix comparison
22 | a = np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]])
23 | b = np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6],
24 | [8, 5, 5, 6, 2, 3, 10, 2, 3, 4],
25 | [1, 2, 2, 4, 4, 7, 6, 5, 5, 3]])
26 |
27 | res = sim.perform(a, b)
28 | expected = cosine_similarity(a, b, dense_output=True)
29 |
30 | # check that we compute a similarity for each pair
31 | self.assertTrue(res.shape[0] == 1 and res.shape[1] == 3)
32 |
33 | np.testing.assert_allclose(expected, res)
34 |
35 | # sparse comparison
36 | a = sparse.csr_matrix(np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]]))
37 | b = sparse.csr_matrix(np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6]]))
38 |
39 | res = sim.perform(a, b)
40 | expected = cosine_similarity(a, b, dense_output=True)
41 |
42 | np.testing.assert_allclose(expected, res)
43 |
44 | # single sparse vs sparse matrix comparison
45 | a = sparse.csr_matrix(np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]]))
46 | b = sparse.csr_matrix(np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6],
47 | [8, 5, 5, 6, 2, 3, 10, 2, 3, 4],
48 | [1, 2, 2, 4, 4, 7, 6, 5, 5, 3]]))
49 |
50 | res = sim.perform(a, b)
51 | expected = cosine_similarity(a, b, dense_output=True)
52 |
53 | # check that we compute a similarity for each pair
54 | self.assertTrue(res.shape[0] == 1 and res.shape[1] == 3)
55 |
56 | np.testing.assert_allclose(expected, res)
57 |
58 | # single vector vs sparse matrix comparison
59 | a = np.array([[5, 9, 7, 8, 3, 5, 4, 2, 6, 4]])
60 | b = sparse.csr_matrix(np.array([[8, 1, 3, 10, 8, 4, 9, 2, 1, 6],
61 | [8, 5, 5, 6, 2, 3, 10, 2, 3, 4],
62 | [1, 2, 2, 4, 4, 7, 6, 5, 5, 3]]))
63 |
64 | res = sim.perform(a, b)
65 | expected = cosine_similarity(a, b, dense_output=True)
66 |
67 | # check that we compute a similarity for each pair
68 | self.assertTrue(res.shape[0] == 1 and res.shape[1] == 3)
69 |
70 | np.testing.assert_allclose(expected, res)
71 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/ratings_manager/score_processor.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Tuple
3 | import numpy as np
4 |
5 |
6 | class ScoreProcessor(ABC):
7 | """
 8 |     Abstract class to process a rating score via the fit method, which each subclass personalizes
9 | """
10 | def __init__(self, decimal_rounding: int = None):
11 | self.__decimal_rounding = decimal_rounding
12 |
13 | @property
14 | def decimal_rounding(self):
15 | return self.__decimal_rounding
16 |
17 | @abstractmethod
18 | def fit(self, score_data: object):
19 | raise NotImplementedError
20 |
21 | def __repr__(self):
22 | return f'ScoreProcessor(decimal rounding={self.__decimal_rounding})'
23 |
24 |
25 | class SentimentAnalysis(ScoreProcessor):
26 | """
27 | Abstract Class that generalizes the sentiment analysis technique
28 | """
29 |
30 | @abstractmethod
31 | def fit(self, score_data: str):
32 | raise NotImplementedError
33 |
34 | @abstractmethod
35 | def __repr__(self):
36 | return f'SentimentAnalysis(decimal rounding={self.decimal_rounding})'
37 |
38 |
39 | class NumberNormalizer(ScoreProcessor):
40 | """
41 | Class that normalizes numeric scores to a scale in the range $[-1.0, 1.0]$
42 |
43 | Args:
44 | scale: Tuple where the first value is the minimum of the actual scale, second value is the maximum of the
45 | actual scale (e.g. `(1, 5)` represents an actual scale of scores from 1 (included) to 5 (included))
46 | decimal_rounding: If set, the normalized score will be rounded to the chosen decimal digit
47 | """
48 | def __init__(self, scale: Tuple[float, float], decimal_rounding: int = None):
49 | super().__init__(decimal_rounding)
50 |
51 | if len(scale) != 2:
52 |             raise ValueError("The voting scale should be a tuple containing exactly two values, "
53 |                              "the minimum of the scale and the maximum!")
54 |
55 | self._old_min = scale[0]
56 | self._old_max = scale[1]
57 |
58 | def __str__(self):
59 | return "NumberNormalizer"
60 |
61 | def __repr__(self):
62 | return f'NumberNormalizer(scale=({self._old_min}, {self._old_max}), decimal rounding={self.decimal_rounding})'
63 |
64 | def fit(self, score_data: float) -> float:
65 | """
66 | Method which will normalize the given score
67 |
68 | Args:
69 | score_data: score that will be normalized
70 |
71 | Returns:
72 | score normalized in the interval $[-1, 1]$
73 | """
74 | def convert_into_range(value: float, old_min: float, old_max: float, new_min: int = -1, new_max: int = 1):
75 | new_value = ((value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min
76 | if self.decimal_rounding:
77 | new_value = np.round(new_value, self.decimal_rounding)
78 |
79 | return new_value
80 |
81 | return convert_into_range(float(score_data), self._old_min, self._old_max)
82 |
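
A worked example of the normalization formula: with scale (1, 5), a score s maps to ((s - 1) / (5 - 1)) * 2 - 1:

    from clayrs.content_analyzer.ratings_manager.score_processor import NumberNormalizer

    normalizer = NumberNormalizer(scale=(1, 5), decimal_rounding=2)

    print(normalizer.fit(1))  # -1.0  (minimum of the old scale)
    print(normalizer.fit(4))  #  0.5  ((4 - 1) / 4) * 2 - 1
    print(normalizer.fit(5))  #  1.0  (maximum of the old scale)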
--------------------------------------------------------------------------------
/clayrs/utils/context_managers.py:
--------------------------------------------------------------------------------
1 | import concurrent
2 | import concurrent.futures
3 | import os
4 | from concurrent.futures import as_completed
5 | from typing import Union, Iterator
6 |
7 | import distex
8 | import contextlib
9 |
10 | from tqdm import tqdm
11 | from tqdm.contrib.logging import logging_redirect_tqdm
12 |
13 | from clayrs.utils.const import logger
14 |
15 |
16 | @contextlib.contextmanager
17 | def get_progbar(iterator, total=None) -> tqdm:
18 | bar_format = "{desc} {percentage:.0f}%|{bar}| {n:}/{total_fmt} [{elapsed}<{remaining}]"
19 | with logging_redirect_tqdm(loggers=[logger]):
20 | with tqdm(iterator, bar_format=bar_format, total=total) as pbar:
21 | yield pbar
22 |
23 |
24 | def handle_exception(loop, context):
25 |     # this is a simple hack to stop asyncio from logging the "task was never retrieved" exception,
26 |     # which should not happen in the first place.
27 |     # In fact this problem only happens in specific scenarios, like the PyCharm interpreter or running
28 |     # an asyncio snippet as a script, but it does not happen if the exact same script is run interactively
29 |     # or in an IPython environment
30 | pass
31 |
32 |
33 | @contextlib.contextmanager
34 | def get_iterator_parallel(num_cpus, f_to_parallelize, *args_to_f,
35 | progress_bar=False, total=None) -> Union[Iterator, tqdm]:
36 |
37 | num_cpus = num_cpus or os.cpu_count() or 1
38 |
39 | if num_cpus > 1:
40 | pool = distex.Pool(num_workers=num_cpus, func_pickle=distex.PickleType.cloudpickle)
41 | pool._loop.set_exception_handler(handle_exception)
42 | iterator_res = pool.map(f_to_parallelize, *args_to_f)
43 | else:
44 | pool = None
45 | iterator_res = map(f_to_parallelize, *args_to_f)
46 |
47 | try:
48 | if progress_bar:
49 | with get_progbar(iterator_res, total=total) as pbar:
50 | yield pbar
51 | else:
52 | yield iterator_res
53 | finally:
54 | if pool is not None:
55 | pool.shutdown()
56 |
57 |
58 | @contextlib.contextmanager
59 | def get_iterator_thread(max_workers, f_to_thread, *args_to_f,
60 | keep_order=False, progress_bar=False, total=None) -> Union[Iterator, tqdm]:
61 |
62 | # min(32, (os.cpu_count() or 1) + 4) taken from ThreadPoolExecutor
63 | max_workers = max_workers or min(32, (os.cpu_count() or 1) + 4) or 1
64 |
65 | if max_workers > 1:
66 |
67 | ex = concurrent.futures.ThreadPoolExecutor(max_workers)
68 | if keep_order:
69 | iterator_res = ex.map(f_to_thread, *args_to_f)
70 | else:
71 | iterator_res = as_completed([ex.submit(f_to_thread, *args) for args in zip(*args_to_f)])
72 | else:
73 | ex = None
74 | iterator_res = map(f_to_thread, *args_to_f)
75 |
76 | try:
77 | if progress_bar:
78 | with get_progbar(iterator_res, total=total) as pbar:
79 | yield pbar
80 | else:
81 | yield iterator_res
82 | finally:
83 | if ex is not None:
84 | ex.shutdown()
85 |
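
A minimal threading sketch with a hypothetical worker function: with `keep_order=True` results come back in input order, while with `keep_order=False` the iterator yields Future objects as they complete (each needing `.result()`):

    from clayrs.utils.context_managers import get_iterator_thread

    def square(x):
        return x * x

    with get_iterator_thread(4, square, range(10), keep_order=True,
                             progress_bar=True, total=10) as it:
        results = list(it)  # [0, 1, 4, ..., 81], in input order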
--------------------------------------------------------------------------------
/test/content_analyzer/test_config.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from clayrs.content_analyzer.exogenous_properties_retrieval import PropertiesFromDataset
4 |
5 | from clayrs.content_analyzer import FieldConfig, ExogenousConfig
6 |
7 |
8 | class TestFieldConfig(TestCase):
9 | def test_invalid_id(self):
10 | with self.assertRaises(ValueError):
11 | FieldConfig(id='.in.vali.d')
12 |
13 | with self.assertRaises(ValueError):
14 | FieldConfig(id='#in#vali#d')
15 |
16 | with self.assertRaises(ValueError):
17 | FieldConfig(id=' ')
18 |
19 | with self.assertRaises(ValueError):
20 | FieldConfig(id='is invalid')
21 |
22 | with self.assertRaises(ValueError):
23 | FieldConfig(id='is/inva/lid')
24 |
25 | # ...and many more
26 |
27 | def test_valid_id(self):
28 | valid_object = FieldConfig(id='test')
29 | self.assertIsNotNone(valid_object)
30 |
31 | valid_object = FieldConfig(id='test_valid')
32 | self.assertIsNotNone(valid_object)
33 |
34 | valid_object = FieldConfig(id='test-valid')
35 | self.assertIsNotNone(valid_object)
36 |
37 | valid_object = FieldConfig(id='test1-valid2')
38 | self.assertIsNotNone(valid_object)
39 |
40 | valid_object = FieldConfig(id='1_2-3_')
41 | self.assertIsNotNone(valid_object)
42 |
43 | # ...and many more
44 |
45 |
46 | class TestExogenousConfig(TestCase):
47 | def test_invalid_id(self):
48 | with self.assertRaises(ValueError):
49 | ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='.in.vali.d')
50 |
51 |         with self.assertRaises(ValueError):
52 |             ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='#in#vali#d')
53 | 
54 |         with self.assertRaises(ValueError):
55 |             ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id=' ')
56 | 
57 |         with self.assertRaises(ValueError):
58 |             ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='is invalid')
59 | 
60 |         with self.assertRaises(ValueError):
61 |             ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='is/inva/lid')
62 |
63 | # ...and many more
64 |
65 | def test_valid_id(self):
66 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test')
67 | self.assertIsNotNone(valid_object)
68 |
69 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test_valid')
70 | self.assertIsNotNone(valid_object)
71 |
72 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test-valid')
73 | self.assertIsNotNone(valid_object)
74 |
75 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='test1-valid2')
76 | self.assertIsNotNone(valid_object)
77 |
78 | valid_object = ExogenousConfig(exogenous_technique=PropertiesFromDataset(), id='1_2-3_')
79 | self.assertIsNotNone(valid_object)
80 |
81 | # ...and many more
82 |
--------------------------------------------------------------------------------
/clayrs/content_analyzer/information_processor/visual_preprocessors/torch_builtin_augmenter.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 |
3 | from torchvision import transforms
4 | from torchvision.transforms import InterpolationMode, AutoAugmentPolicy
5 |
6 | from clayrs.content_analyzer.information_processor.visual_preprocessors.torch_builtin_transformer import \
7 | TorchBuiltInTransformer
8 |
9 | __all__ = [
10 | "AutoAugmentPolicy",
11 | "TorchAutoAugment",
12 | "TorchRandAugment",
13 | "TorchTrivialAugmentWide"
14 | ]
15 |
16 |
17 | # AUGMENTERS
18 |
19 | class TorchAutoAugment(TorchBuiltInTransformer):
20 | """
21 | Class that implements the AutoAugment Transformer from torchvision.
22 |     The parameters one could pass are the same ones you would pass when instantiating
23 | the transformer AutoAugment directly from torchvision.
24 |
25 | TorchVision documentation: [here](https://pytorch.org/vision/main/generated/torchvision.transforms.AutoAugment.html)
26 |
27 | NOTE: the augmented result will SUBSTITUTE the original input
28 | """
29 | def __init__(self, policy: AutoAugmentPolicy = AutoAugmentPolicy.IMAGENET,
30 | interpolation: InterpolationMode = InterpolationMode.NEAREST,
31 | fill: Optional[List[float]] = None):
32 |
33 | super().__init__(transforms.AutoAugment(policy, interpolation, fill))
34 |
35 |
36 | class TorchRandAugment(TorchBuiltInTransformer):
37 | """
38 | Class that implements the RandAugment Transformer from torchvision.
39 |     The parameters one could pass are the same ones you would pass when instantiating
40 | the transformer RandAugment directly from torchvision.
41 |
42 | TorchVision documentation: [here](https://pytorch.org/vision/main/generated/torchvision.transforms.RandAugment.html)
43 |
44 | NOTE: the augmented result will SUBSTITUTE the original input
45 | """
46 | def __init__(
47 | self,
48 | num_ops: int = 2,
49 | magnitude: int = 9,
50 | num_magnitude_bins: int = 31,
51 | interpolation: InterpolationMode = InterpolationMode.NEAREST,
52 | fill: Optional[List[float]] = None,
53 | ) -> None:
54 | super().__init__(transforms.RandAugment(num_ops, magnitude, num_magnitude_bins, interpolation, fill))
55 |
56 |
57 | class TorchTrivialAugmentWide(TorchBuiltInTransformer):
58 | """
59 | Class that implements the TrivialAugmentWide Transformer from torchvision.
60 |     The parameters one could pass are the same ones you would pass when instantiating
61 | the transformer TrivialAugmentWide directly from torchvision.
62 |
63 | TorchVision documentation: [here](https://pytorch.org/vision/main/generated/torchvision.transforms.TrivialAugmentWide.html)
64 |
65 | NOTE: the augmented result will SUBSTITUTE the original input
66 | """
67 | def __init__(
68 | self,
69 | num_magnitude_bins: int = 31,
70 | interpolation: InterpolationMode = InterpolationMode.NEAREST,
71 | fill: Optional[List[float]] = None,
72 | ) -> None:
73 | super().__init__(transforms.TrivialAugmentWide(num_magnitude_bins, interpolation, fill))
74 |
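
An illustrative sketch (dummy image tensor; torchvision's AutoAugment expects uint8 image data). It assumes TorchBuiltInTransformer instances expose the `process` method of the ImageProcessor interface; treat it as a sketch rather than the definitive API:

    import torch

    augmenter = TorchAutoAugment(policy=AutoAugmentPolicy.CIFAR10)

    image = (torch.rand(3, 32, 32) * 255).to(torch.uint8)  # dummy CHW image
    augmented = augmenter.process(image)  # the augmented result replaces the original input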
--------------------------------------------------------------------------------
/clayrs/content_analyzer/information_processor/information_processor_abstract.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List, Any
3 |
4 | import torch
5 |
6 |
7 | class InformationProcessor(ABC):
8 | """
9 | Abstract class that generalizes data processing.
10 | """
11 |
12 | @abstractmethod
13 | def process(self, field_data: Any):
14 | raise NotImplementedError
15 |
16 | @abstractmethod
17 | def __eq__(self, other):
18 | raise NotImplementedError
19 |
20 | @abstractmethod
21 | def __str__(self):
22 | raise NotImplementedError
23 |
24 | @abstractmethod
25 | def __repr__(self):
26 | raise NotImplementedError
27 |
28 |
29 | class ImageProcessor(InformationProcessor, torch.nn.Module):
30 | """
31 | Abstract class for image processing.
32 | """
33 | @abstractmethod
34 | def forward(self, field_data: torch.Tensor) -> torch.Tensor:
35 | raise NotImplementedError
36 |
37 | def process(self, field_data: torch.Tensor) -> torch.Tensor:
38 | return self.forward(field_data)
39 |
40 | def __eq__(self, other):
41 | return torch.nn.Module.__eq__(self, other)
42 |
43 |
44 | class AudioProcessor(InformationProcessor):
45 | """
46 | Abstract class for audio processing.
47 | """
48 | @abstractmethod
49 | def process(self, field_data):
50 | raise NotImplementedError
51 |
52 |
53 | class TextProcessor(InformationProcessor):
54 | """
55 | Abstract class for raw text processing.
56 | """
57 |
58 | @staticmethod
59 | def list_to_string(text: List[str]) -> str:
60 | """
61 |         Convert a list of strings into a single space-separated string
62 |         Args: text (List[str]): list of words
63 |         Returns: str: the joined sentence
64 | """
65 | string_text = ' '.join([str(elem) for elem in text])
66 | return string_text
67 |
68 | @staticmethod
69 | def string_to_list(text: str) -> List[str]:
70 | """
71 |         Convert a string into a list of strings
72 |         Args:
73 |             text (str): str sentence
74 | 
75 |         Returns: List[str]: list of words
76 | """
77 | list_text = list(text.split(" "))
78 | return list_text
79 |
80 | @abstractmethod
81 | def process(self, field_data: str):
82 | raise NotImplementedError
83 |
84 |
85 | class NLP(TextProcessor):
86 | """
87 | Class for processing a text via Natural Language Processing.
88 |
89 | """
90 |
91 | @abstractmethod
92 | def process(self, field_data: str) -> List[str]:
93 | """
94 |         Apply the required preprocessing steps to the original text
95 | Args:
96 | field_data: text on which NLP with specified phases will be applied
97 |
98 | Returns:
99 | list: The text, after being processed with the specified NLP pipeline,
100 |                 is split into single words that are put into a list. The splitting is performed
101 |                 even if none of the preprocessing steps is applied.
102 | """
103 | raise NotImplementedError
104 |
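
The two static helpers are each other's inverse on space-separated text; a tiny sketch:

    from clayrs.content_analyzer.information_processor.information_processor_abstract import TextProcessor

    tokens = TextProcessor.string_to_list("the movie was great")  # ['the', 'movie', 'was', 'great']
    sentence = TextProcessor.list_to_string(tokens)               # 'the movie was great'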
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/random_indexing.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import numpy as np
4 | import gensim
5 | from gensim.models import RpModel
6 | from gensim.corpora import Dictionary
7 |
8 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimDocumentEmbeddingLearner
9 | from clayrs.content_analyzer.utils.check_tokenization import check_tokenized
10 |
11 |
12 | class GensimRandomIndexing(GensimDocumentEmbeddingLearner):
13 | """
14 | Class that implements RandomIndexing model thanks to the Gensim library.
15 |
16 |     If a pre-trained local Random Indexing model must be loaded, put its path in the `reference` parameter.
17 |     Otherwise, a Random Indexing model will be trained from scratch on the preprocessed corpus of the contents
18 |     to represent
19 |
20 | If you'd like to save the model once trained, set the path in the `reference` parameter and set
21 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to
22 | produce contents in the current run
23 |
24 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/rpmodel.html)
25 | to see what else can be customized
26 |
27 | Args:
28 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the
29 | trained model won't be saved after training and will only be used to produce contents in the current run
30 | auto_save: If True, the model will be saved in the path specified in `reference` parameter
31 | """
32 |
33 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
34 | super().__init__(reference, auto_save, ".model", **kwargs)
35 |
36 | def fit_model(self, corpus: List):
37 | dictionary = Dictionary(corpus)
38 | word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus]
39 | self.model = RpModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters)
40 |
41 | def load_model(self):
42 | return RpModel.load(self.reference)
43 |
44 | def get_vector_size(self) -> int:
45 | return self.model.num_topics
46 |
47 | def get_embedding(self, document_tokenized: List[str]) -> np.ndarray:
48 | unseen_doc = self.model.id2word.doc2bow(check_tokenized(document_tokenized))
49 |
50 | # if document is totally new (no word in train corpus) KeyError is raised
51 | # and load method of embedding source will fill the document vector with zeros
52 | if len(unseen_doc) == 0:
53 | raise KeyError
54 |
55 | sparse_vector = self.model[unseen_doc]
56 | dense_vector = gensim.matutils.sparse2full(sparse_vector, self.model.num_topics)
57 | return dense_vector
58 |
59 | def __str__(self):
60 | return "GensimRandomProjections"
61 |
62 | def __repr__(self):
63 | return f"GensimRandomProjections(reference={self.reference}, auto_save={self._auto_save}, " \
64 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})"
65 |
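
A sketch of the unseen-document behaviour documented in `get_embedding`, using gensim's toy corpus:

    from gensim.test.utils import common_texts

    from clayrs.content_analyzer.embeddings.embedding_learner.random_indexing import GensimRandomIndexing

    learner = GensimRandomIndexing(num_topics=10)
    learner.fit_model(common_texts)

    try:
        learner.get_embedding(["totally", "unseen", "words"])
    except KeyError:
        # the embedding source catches this and fills the document vector with zeros
        pass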
--------------------------------------------------------------------------------
/clayrs/content_analyzer/embeddings/embedding_learner/latent_semantic_analysis.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import gensim
4 | from gensim.corpora import Dictionary
5 | from gensim.models import LsiModel
6 |
7 | from clayrs.content_analyzer.embeddings.embedding_learner.embedding_learner import GensimDocumentEmbeddingLearner
8 | from clayrs.content_analyzer.utils.check_tokenization import check_tokenized
9 |
10 |
11 | class GensimLatentSemanticAnalysis(GensimDocumentEmbeddingLearner):
12 | """
13 | Class that implements Latent Semantic Analysis (A.K.A. Latent Semantic Indexing)
14 | (LSI) thanks to the Gensim library.
15 |
16 |     If a pre-trained local LSA model must be loaded, put its path in the `reference` parameter.
17 |     Otherwise, an LSA model will be trained from scratch on the preprocessed corpus of the contents to
18 |     represent
19 |
20 | If you'd like to save the model once trained, set the path in the `reference` parameter and set
21 | `auto_save=True`. If `reference` is None, trained model won't be saved after training and will only be used to
22 | produce contents in the current run
23 |
24 | Additional parameters regarding the model itself could be passed, check [gensim documentation](https://radimrehurek.com/gensim/models/lsimodel.html)
25 | to see what else can be customized
26 |
27 | Args:
28 | reference: Path of the model to load/where the model trained will be saved if `auto_save=True`. If None the
29 | trained model won't be saved after training and will only be used to produce contents in the current run
30 | auto_save: If True, the model will be saved in the path specified in `reference` parameter
31 | """
32 |
33 | def __init__(self, reference: str = None, auto_save: bool = True, **kwargs):
34 | super().__init__(reference, auto_save, ".model", **kwargs)
35 |
36 | def fit_model(self, corpus: List):
37 | dictionary = Dictionary(corpus)
38 | word_docs_matrix = [dictionary.doc2bow(doc) for doc in corpus]
39 | self.model = LsiModel(word_docs_matrix, id2word=dictionary, **self.additional_parameters)
40 |
41 | def load_model(self):
42 | return LsiModel.load(self.reference)
43 |
44 | def get_vector_size(self) -> int:
45 | return self.model.num_topics
46 |
47 | def get_embedding(self, document_tokenized: List[str]):
48 | unseen_doc = self.model.id2word.doc2bow(check_tokenized(document_tokenized))
49 |
50 | # if document is totally new (no word in train corpus) KeyError is raised
51 | # and load method of embedding source will fill the document vector with zeros
52 | if len(unseen_doc) == 0:
53 | raise KeyError
54 |
55 | sparse_vector = self.model[unseen_doc]
56 | dense_vector = gensim.matutils.sparse2full(sparse_vector, self.model.num_topics)
57 | return dense_vector
58 |
59 | def __str__(self):
60 | return "GensimLatentSemanticAnalysis"
61 |
62 | def __repr__(self):
63 | return f"GensimLatentSemanticAnalysis(reference={self.reference}, auto_save={self._auto_save}, " \
64 | f"{', '.join(f'{arg}={val}' for arg, val in self._additional_parameters.items())})"
65 |
--------------------------------------------------------------------------------
/test/recsys/graph_based_algorithm/test_graph_based_algorithm.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import pandas as pd
3 |
4 | from clayrs.content_analyzer import Ratings
5 | from clayrs.recsys.graphs.graph import ItemNode, UserNode, PropertyNode
6 | from clayrs.recsys.graphs.nx_implementation.nx_full_graphs import NXFullGraph
7 |
8 | from clayrs.recsys.graph_based_algorithm.page_rank.nx_page_rank import NXPageRank
9 |
10 |
11 | class TestGraphBasedAlgorithm(TestCase):
12 |
13 | def setUp(self) -> None:
14 | ratings = pd.DataFrame.from_records([
15 | ("A000", "tt0114576", 1, "54654675"),
16 | ("A000", "tt0112453", -0.2, "54654675"),
17 | ("A001", "tt0114576", 0.8, "54654675"),
18 | ("A001", "tt0112896", -0.4, "54654675"),
19 | ("A000", "tt0113041", 0.6, "54654675"),
20 | ("A002", "tt0112453", -0.2, "54654675"),
21 | ("A002", "tt0113497", 0.5, "54654675"),
22 | ("A003", "tt0112453", -0.8, "54654675")],
23 | columns=["from_id", "to_id", "score", "timestamp"])
24 | ratings = Ratings.from_dataframe(ratings)
25 |
26 | self.graph = NXFullGraph(ratings)
27 |
28 | # GraphBasedAlgorithm is an abstract class, so we instantiate a subclass in order to test its methods
29 | self.alg = NXPageRank()
30 |
31 | def test_filter_result(self):
32 | rank = {UserNode("A000"): 0.5, ItemNode("tt0114576"): 0.5, UserNode("A001"): 0.5, ItemNode("tt0113497"): 0.5,
33 | ItemNode("tt0112453"): 0.5, PropertyNode("Nolan"): 0.5}
34 |
35 | # filter list with item i1, in this case graph parameter and user node parameter won't do anything
36 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=[ItemNode('tt0114576')],
37 | user_node=UserNode("A000"))
38 | expected = {ItemNode("tt0114576"): 0.5}
39 | self.assertEqual(expected, result)
40 |
41 | # filter list with item i1 and item i2, in this case graph parameter and user node parameter won't do anything
42 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=[ItemNode('tt0114576'),
43 | PropertyNode('Nolan')],
44 | user_node=UserNode("A000"))
45 | expected = {ItemNode('tt0114576'): 0.5, PropertyNode("Nolan"): 0.5}
46 | self.assertEqual(expected, result)
47 |
48 |         # filter with non-existent nodes, result will be empty
49 | # in this case graph parameter and user node parameter won't do anything
50 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=[ItemNode('non_existent')],
51 | user_node=UserNode("A000"))
52 | expected = {}
53 | self.assertEqual(expected, result)
54 |
55 | # clean result for user A000, the cleaned result will have only item nodes
56 | result = self.alg.filter_result(graph=self.graph, result=rank, filter_list=None,
57 | user_node=UserNode("A000"))
58 | expected = {ItemNode("tt0113497"): 0.5}
59 |
60 | self.assertEqual(expected, result)
61 |
--------------------------------------------------------------------------------
/test/recsys/graphs/test_graph.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pandas as pd
4 |
5 | from clayrs.content_analyzer import Ratings
6 | from clayrs.recsys import NXFullGraph
7 | from test import dir_test_files
8 | from unittest import TestCase
9 |
10 | movies_dir = os.path.join(dir_test_files, 'complex_contents', 'movies_codified/')
11 | users_dir = os.path.join(dir_test_files, 'complex_contents', 'users_codified/')
12 |
13 | rat = pd.DataFrame.from_dict({'from_id': ["1", "1", "2", "2", "2", "3", "4", "4"],
14 | 'to_id': ["tt0112281", "tt0112302", "tt0112281", "tt0112346",
15 | "tt0112453", "tt0112453", "tt0112346", "tt0112453"],
16 | 'score': [0.8, 0.7, -0.4, 1.0, 0.4, 0.1, -0.3, 0.7]})
17 | rat = Ratings.from_dataframe(rat)
18 |
19 |
20 | class TestGraph(TestCase):
21 | def setUp(self) -> None:
22 |         # Graph is an abstract class, so we instantiate a subclass in order to test its methods
23 | self.g: NXFullGraph = NXFullGraph(rat,
24 | item_contents_dir=movies_dir,
25 | item_exo_properties={'dbpedia': ['film director',
26 | 'runtime (m)']},
27 |
28 |                                           # '1' is the column in the users .DAT file that identifies the gender
29 | user_exo_properties={'local': '1'},
30 | user_contents_dir=users_dir)
31 |
32 | def test_to_ratings(self):
33 | converted_rat = self.g.to_ratings()
34 |
35 | # check that original ratings and converted ratings are equal
36 | self.assertEqual(set(rat.unique_user_id_column), set(converted_rat.unique_user_id_column))
37 | self.assertEqual(set(rat.unique_item_id_column), set(converted_rat.unique_item_id_column))
38 | self.assertEqual(set(rat.score_column), set(converted_rat.score_column))
39 | self.assertEqual(set(rat.timestamp_column), set(converted_rat.timestamp_column))
40 |
41 |         # check that, for each user, the converted ratings contain the same interactions
42 | for user in rat.user_id_column:
43 | user_rat = rat.get_user_interactions(user)
44 | user_converted_rat = converted_rat.get_user_interactions(user)
45 |
46 | self.assertCountEqual(user_rat, user_converted_rat)
47 |
48 |         # user map explicitly set, so we expect the original and converted ratings to share the same user map
49 | converted_rat_with_user_map = self.g.to_ratings(user_map=rat.user_map)
50 | self.assertEqual(list(rat.user_map), list(converted_rat_with_user_map.user_map))
51 |
52 |         # item map explicitly set, so we expect the original and converted ratings to share the same item map
53 | converted_rat_with_item_map = self.g.to_ratings(item_map=rat.item_map)
54 | self.assertEqual(list(rat.item_map), list(converted_rat_with_item_map.item_map))
55 |
56 |         # both user map and item map set, so we expect both to match between original and converted ratings
57 | converted_rat_with_user_item_map = self.g.to_ratings(user_map=rat.user_map, item_map=rat.item_map)
58 | self.assertEqual(list(rat.user_map), list(converted_rat_with_user_item_map.user_map))
59 | self.assertEqual(list(rat.item_map), list(converted_rat_with_user_item_map.item_map))
60 |
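test_to_ratings first compares users, items, scores and timestamps as sets, presumably because node iteration order in the graph is not guaranteed; only when the original maps are passed back in can the integer mappings be expected to match element-wise. A condensed sketch of that round trip, reusing the module-level rat defined above:

    graph = NXFullGraph(rat)   # ratings -> user/item graph
    back = graph.to_ratings(user_map=rat.user_map, item_map=rat.item_map)

    # with the original maps supplied, the mappings match element-wise
    assert list(rat.user_map) == list(back.user_map)
    assert list(rat.item_map) == list(back.item_map)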
--------------------------------------------------------------------------------
/test/content_analyzer/content_representation/test_representation_container.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | import numpy as np
3 |
4 | from clayrs.content_analyzer.content_representation.representation_container import RepresentationContainer
5 |
6 |
7 | class TestRepresentationContainer(TestCase):
8 |
9 | def test_rep_container(self):
10 | rep_container = RepresentationContainer(['rep1', 'rep2', 'rep3'], ['test1', None, 'test3'])
11 |
12 |         # check that the indexes and columns of the dataframe inside the representation container are set as expected
13 | self.assertEqual([0, 1, 2], rep_container.get_internal_index())
14 | self.assertEqual(['test1', None, 'test3'], rep_container.get_external_index())
15 | self.assertEqual(['rep1', 'rep2', 'rep3'], rep_container.get_representations())
16 |
17 |         # check that indexing rep_container by internal_id or external_id returns the appropriate
18 |         # representation, and that repeated accesses keep returning the same result
19 | for _ in range(15000):
20 | self.assertEqual('rep1', rep_container[0])
21 | self.assertEqual('rep3', rep_container['test3'])
22 |
23 |         # check the correct behaviour of the append and pop methods
24 | rep_container.append('rep4', 'test4')
25 | self.assertEqual('rep4', rep_container['test4'])
26 |
27 | value_removed = rep_container.pop('test4')
28 | self.assertEqual('rep4', value_removed)
29 | self.assertFalse('rep4' in rep_container.get_representations())
30 |
31 | # test for empty representation container
32 | empty_rep_container = RepresentationContainer()
33 | self.assertEqual(0, len(empty_rep_container))
34 |
35 |         # test passing single values to the constructor instead of lists
36 | single_rep_container = RepresentationContainer('rep', 'test')
37 | self.assertEqual('rep', single_rep_container['test'])
38 |
39 |         # test that a ValueError is raised when representation and external_id lists of different lengths are passed to the constructor
40 | with self.assertRaises(ValueError):
41 | RepresentationContainer(['rep1', 'rep2'], ['test1'])
42 |
43 |         # test that a ValueError is raised when lists of different lengths are passed to the 'append' method
44 | with self.assertRaises(ValueError):
45 | rep_container.append(['rep1', 'rep2'], ['test1'])
46 |
47 |         # test that a KeyError is raised when the requested representation is not present
48 | with self.assertRaises(KeyError):
49 | err = rep_container['not_existent']
50 |
51 | def test_iter(self):
52 | rep_container = RepresentationContainer(['rep1', 'rep2', 'rep3'], ['test1', None, 'test3'])
53 |
54 | expected_list = [
55 | {'internal_id': 0, 'external_id': 'test1', 'representation': 'rep1'},
56 | {'internal_id': 1, 'external_id': None, 'representation': 'rep2'},
57 | {'internal_id': 2, 'external_id': 'test3', 'representation': 'rep3'}
58 | ]
59 |
60 | it = iter(rep_container)
61 |
62 | self.assertEqual(expected_list[0], next(it))
63 | self.assertEqual(expected_list[1], next(it))
64 | self.assertEqual(expected_list[2], next(it))
65 |
66 |         # check that the iterator raises StopIteration since there are no items left
67 | with self.assertRaises(StopIteration):
68 | next(it)
69 |
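The behaviour exercised here, lookup by positional internal id or by an optional external id with both indexes kept in sync on append, can be illustrated with two plain structures. A toy sketch of the idea (the real container is dataframe-backed, as the comments above suggest; append and lookup only, pop omitted):

    class DualIndexSketch:
        def __init__(self):
            self._representations = []       # internal id == position in this list
            self._external_to_internal = {}  # external id -> internal id

        def append(self, representation, external_id=None):
            if external_id is not None:
                self._external_to_internal[external_id] = len(self._representations)
            self._representations.append(representation)

        def __getitem__(self, key):
            if isinstance(key, int):
                return self._representations[key]
            # unknown external ids raise KeyError, as the test expects
            return self._representations[self._external_to_internal[key]]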
--------------------------------------------------------------------------------
/test/test_files/movies_info_reduced.csv:
--------------------------------------------------------------------------------
1 | "Title","Year","Rated","Released","Runtime","Genre","Director","Writer","Actors","Plot","Language","Country","Awards","Poster","Metascore","imdbRating","imdbVotes","imdbID","Type","DVD","BoxOffice","Production","Website","Response"
2 | "Jumanji","1995","PG","15 Dec 1995","104 min","Adventure, Family, Fantasy","Joe Johnston","Jonathan Hensleigh (screenplay by), Greg Taylor (screenplay by), Jim Strain (screenplay by), Greg Taylor (screen story by), Jim Strain (screen story by), Chris Van Allsburg (screen story by), Chris Van Allsburg (based on the book by)","Robin Williams, Jonathan Hyde, Kirsten Dunst, Bradley Pierce","After being trapped in a jungle board game for 26 years, a Man-Child wins his release from the game. But, no sooner has he arrived that he is forced to play again, and this time sets the creatures of the jungle loose on the city. Now it is up to him to stop them.","English, French","USA","4 wins & 9 nominations.","https://m.media-amazon.com/images/M/MV5BZTk2ZmUwYmEtNTcwZS00YmMyLWFkYjMtNTRmZDA3YWExMjc2XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","39","6.9","260,909","tt0113497","movie","25 Jan 2000","N/A","Sony Pictures Home Entertainment","N/A","True"
3 | "Grumpier Old Men","1995","PG-13","22 Dec 1995","101 min","Comedy, Romance","Howard Deutch","Mark Steven Johnson (characters), Mark Steven Johnson","Walter Matthau, Jack Lemmon, Sophia Loren, Ann-Margret","Things don't seem to change much in Wabasha County: Max and John are still fighting after 35 years, Grandpa still drinks, smokes, and chases women , and nobody's been able to catch the fabled ""Catfish Hunter"", a gigantic catfish that actually smiles at fishermen who try to snare it. Six months ago John married the new girl in town (Ariel), and people begin to suspect that Max might be missing something similar in his life. The only joy Max claims is left in his life is fishing, but that might change with the new owner of the bait shop.","English, Italian, German","USA","2 wins & 2 nominations.","https://m.media-amazon.com/images/M/MV5BMjQxM2YyNjMtZjUxYy00OGYyLTg0MmQtNGE2YzNjYmUyZTY1XkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg","46","6.6","21,823","tt0113228","movie","18 Nov 1997","N/A","Warner Home Video","N/A","True"
4 | "Toy Story","1995","G","22 Nov 1995","81 min","Animation, Adventure, Comedy, Family, Fantasy","John Lasseter","John Lasseter (original story by), Pete Docter (original story by), Andrew Stanton (original story by), Joe Ranft (original story by), Joss Whedon (screenplay by), Andrew Stanton (screenplay by), Joel Cohen (screenplay by), Alec Sokolow (screenplay by)","Tom Hanks, Tim Allen, Don Rickles, Jim Varney","A little boy named Andy loves to be in his room, playing with his toys, especially his doll named ""Woody"". But, what do the toys do when Andy is not with them, they come to life. Woody believes that he has life (as a toy) good. However, he must worry about Andy's family moving, and what Woody does not know is about Andy's birthday party. Woody does not realize that Andy's mother gave him an action figure known as Buzz Lightyear, who does not believe that he is a toy, and quickly becomes Andy's new favorite toy. Woody, who is now consumed with jealousy, tries to get rid of Buzz. Then, both Woody and Buzz are now lost. They must find a way to get back to Andy before he moves without them, but they will have to pass through a ruthless toy killer, Sid Phillips.","English","USA","Nominated for 3 Oscars. Another 23 wins & 17 nominations.","https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg","95","8.3","761,649","tt0114709","movie","20 Mar 2001","N/A","Buena Vista","http://www.disney.com/ToyStory","True"
5 |
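Note the RFC 4180-style quoting in this fixture: embedded double quotes are doubled (e.g. ""Catfish Hunter"" in row 3), which Python's csv module and pandas handle out of the box. A quick sketch:

    import csv, io

    # doubled quotes inside a quoted field are unescaped by the default dialect
    row = next(csv.reader(io.StringIO('"a ""quoted"" word","b"')))
    assert row == ['a "quoted" word', 'b']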
--------------------------------------------------------------------------------