├── .gitignore ├── LICENSE ├── README.md ├── azure-pipelines.yml ├── environment.yml ├── exercises ├── setup.py └── tmclass_exercises │ ├── __init__.py │ ├── data │ ├── __init__.py │ └── poetry │ │ ├── __init__.py │ │ ├── basho.txt │ │ ├── baudelaire.txt │ │ ├── metadata.json │ │ ├── rumi.txt │ │ ├── shakespeare.txt │ │ └── verlaine.txt │ ├── data_download.py │ ├── encoding.py │ ├── indexing.py │ ├── language_detector.py │ ├── pretrained_models │ └── __init__.py │ ├── scraping.py │ ├── tests │ ├── __init__.py │ ├── test_encoding.py │ ├── test_indexing.py │ ├── test_language_detector.py │ ├── test_scraping.py │ └── test_text_manipulation.py │ └── text_manipulation.py ├── notebooks ├── BBC_News_Dataset_part_1_encoding.ipynb ├── BBC_News_Dataset_part_2.ipynb ├── BBC_News_Dataset_part_3.ipynb ├── BBC_News_Dataset_part_4.ipynb ├── Scraping_with_webdriver.ipynb └── notebook_solutions │ ├── build_preprocessor.py │ ├── build_tokenizer.py │ ├── cluster_weights.py │ ├── extract_book_bestsellers.py │ ├── extract_bookname.py │ ├── extract_price.py │ ├── extract_product_id.py │ ├── homogeneity_vs_completeness.py │ ├── scrape_best_sellers.py │ ├── tfidf_similarities.py │ └── top_idf.py ├── requirements.txt ├── setup.cfg ├── solutions ├── setup.py └── tmclass_solutions │ ├── __init__.py │ ├── data │ ├── __init__.py │ └── poetry │ │ ├── __init__.py │ │ ├── basho.txt │ │ ├── baudelaire.txt │ │ ├── metadata.json │ │ ├── rumi.txt │ │ ├── shakespeare.txt │ │ └── verlaine.txt │ ├── data_download.py │ ├── encoding.py │ ├── indexing.py │ ├── language_detector.py │ ├── pretrained_models │ └── __init__.py │ ├── scraping.py │ ├── tests │ ├── __init__.py │ ├── test_encoding.py │ ├── test_indexing.py │ ├── test_language_detector.py │ ├── test_scraping.py │ └── test_text_manipulation.py │ └── text_manipulation.py └── tools ├── ci └── requirements.txt ├── scrape_language_dataset.py ├── synchronize_exercises.py └── train_language_classifier.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/README.md -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/azure-pipelines.yml -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/environment.yml -------------------------------------------------------------------------------- /exercises/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/setup.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/__init__.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/basho.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data/poetry/basho.txt -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/baudelaire.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data/poetry/baudelaire.txt -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/metadata.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data/poetry/metadata.json -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/rumi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data/poetry/rumi.txt -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/shakespeare.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data/poetry/shakespeare.txt -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data/poetry/verlaine.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data/poetry/verlaine.txt -------------------------------------------------------------------------------- /exercises/tmclass_exercises/data_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/data_download.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/encoding.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/indexing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/indexing.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/language_detector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/language_detector.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/pretrained_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /exercises/tmclass_exercises/scraping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/scraping.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /exercises/tmclass_exercises/tests/test_encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/tests/test_encoding.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/tests/test_indexing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/tests/test_indexing.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/tests/test_language_detector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/tests/test_language_detector.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/tests/test_scraping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/tests/test_scraping.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/tests/test_text_manipulation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/tests/test_text_manipulation.py -------------------------------------------------------------------------------- /exercises/tmclass_exercises/text_manipulation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/exercises/tmclass_exercises/text_manipulation.py -------------------------------------------------------------------------------- /notebooks/BBC_News_Dataset_part_1_encoding.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/BBC_News_Dataset_part_1_encoding.ipynb -------------------------------------------------------------------------------- /notebooks/BBC_News_Dataset_part_2.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/BBC_News_Dataset_part_2.ipynb -------------------------------------------------------------------------------- /notebooks/BBC_News_Dataset_part_3.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/BBC_News_Dataset_part_3.ipynb -------------------------------------------------------------------------------- /notebooks/BBC_News_Dataset_part_4.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/BBC_News_Dataset_part_4.ipynb -------------------------------------------------------------------------------- /notebooks/Scraping_with_webdriver.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/Scraping_with_webdriver.ipynb -------------------------------------------------------------------------------- /notebooks/notebook_solutions/build_preprocessor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/build_preprocessor.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/build_tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/build_tokenizer.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/cluster_weights.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/cluster_weights.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/extract_book_bestsellers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/extract_book_bestsellers.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/extract_bookname.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/extract_bookname.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/extract_price.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/extract_price.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/extract_product_id.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/extract_product_id.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/homogeneity_vs_completeness.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/homogeneity_vs_completeness.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/scrape_best_sellers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/scrape_best_sellers.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/tfidf_similarities.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/tfidf_similarities.py -------------------------------------------------------------------------------- /notebooks/notebook_solutions/top_idf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/notebooks/notebook_solutions/top_idf.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/requirements.txt -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/setup.cfg -------------------------------------------------------------------------------- /solutions/setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/setup.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/__init__.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/basho.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data/poetry/basho.txt -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/baudelaire.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data/poetry/baudelaire.txt -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/metadata.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data/poetry/metadata.json -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/rumi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data/poetry/rumi.txt -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/shakespeare.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data/poetry/shakespeare.txt -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data/poetry/verlaine.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data/poetry/verlaine.txt -------------------------------------------------------------------------------- /solutions/tmclass_solutions/data_download.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/data_download.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/encoding.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/indexing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/indexing.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/language_detector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/language_detector.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/pretrained_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /solutions/tmclass_solutions/scraping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/scraping.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /solutions/tmclass_solutions/tests/test_encoding.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/tests/test_encoding.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/tests/test_indexing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/tests/test_indexing.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/tests/test_language_detector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/tests/test_language_detector.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/tests/test_scraping.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/tests/test_scraping.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/tests/test_text_manipulation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/tests/test_text_manipulation.py -------------------------------------------------------------------------------- /solutions/tmclass_solutions/text_manipulation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/solutions/tmclass_solutions/text_manipulation.py -------------------------------------------------------------------------------- /tools/ci/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/tools/ci/requirements.txt -------------------------------------------------------------------------------- /tools/scrape_language_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/tools/scrape_language_dataset.py -------------------------------------------------------------------------------- /tools/synchronize_exercises.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/tools/synchronize_exercises.py -------------------------------------------------------------------------------- /tools/train_language_classifier.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/text-mining-class/HEAD/tools/train_language_classifier.py --------------------------------------------------------------------------------