├── talks ├── pyConLT │ └── img ├── pyData │ ├── img │ │ ├── logo.png │ │ ├── notebook.png │ │ ├── poc_base.jpg │ │ ├── poc_dev1.png │ │ ├── poc_dev2.png │ │ ├── poc_dev3.png │ │ ├── poc_dev4.png │ │ ├── poc_ml1.png │ │ ├── poc_ml2.png │ │ ├── poc_ml3.png │ │ ├── poc_ml4.png │ │ ├── results.png │ │ ├── roc_curve.png │ │ ├── GitHub-logo.png │ │ ├── dvc │ │ │ ├── dvc_cmd.png │ │ │ ├── gen_dvc.png │ │ │ ├── dvc_home_page.png │ │ │ ├── dvc_home_page1.png │ │ │ ├── dvc_home_page2.png │ │ │ ├── dvc_home_page3.png │ │ │ ├── pipeline │ │ │ │ ├── DVC1.png │ │ │ │ ├── DVC2.png │ │ │ │ ├── DVC3.png │ │ │ │ ├── DVC4.png │ │ │ │ ├── DVC5.png │ │ │ │ ├── DVC6.png │ │ │ │ ├── DVC7.png │ │ │ │ ├── DVC8.png │ │ │ │ ├── DVC9.png │ │ │ │ ├── DVC_change0.png │ │ │ │ ├── DVC_change1.png │ │ │ │ ├── DVC_change2.png │ │ │ │ ├── DVC_change3.png │ │ │ │ ├── DVC_change4.png │ │ │ │ ├── DVC_change5.png │ │ │ │ └── DVC_change2bis.png │ │ │ └── script_docstring_extract.png │ │ ├── icons │ │ │ ├── gear.png │ │ │ ├── database.png │ │ │ ├── youtube.png │ │ │ ├── analytics.png │ │ │ ├── parameters.png │ │ │ ├── parameters_blue.png │ │ │ └── parameters_grey.png │ │ ├── poc_worst1.png │ │ ├── poc_worst2.png │ │ ├── poc_worst3.png │ │ ├── crying_unicorn.png │ │ ├── global_schema1.png │ │ ├── global_schema2.png │ │ ├── confusion_matrix.png │ │ ├── mlv_convert │ │ │ ├── cmd.png │ │ │ ├── script1.png │ │ │ ├── script2.png │ │ │ ├── script3.png │ │ │ ├── script4.png │ │ │ ├── cmd_param.png │ │ │ └── nb_docstring.png │ │ ├── nb_convert │ │ │ ├── script.png │ │ │ ├── formated_script.png │ │ │ ├── formated_script_no_effect.png │ │ │ └── formated_script_not_conf.png │ │ ├── nb_convert_script.png │ │ └── nb_docstring_extract.png │ ├── overview.md │ └── draft.md ├── workshop │ └── img │ │ ├── logo.png │ │ ├── GitHub-logo.png │ │ ├── icons │ │ └── youtube.png │ │ └── dvc │ │ ├── pipeline │ │ ├── DVC1.png │ │ ├── DVC2.png │ │ ├── DVC3.png │ │ ├── DVC4.png │ │ ├── DVC5.png │ │ ├── DVC9.png │ │ ├── DVC_change0.png │ │ ├── DVC_change1.png │ │ ├── DVC_change2.png │ │ ├── DVC_change3.png │ │ ├── DVC_change4.png │ │ ├── DVC_change5.png │ │ └── DVC_change2bis.png │ │ ├── dvc_home_page1.png │ │ ├── dvc_home_page2.png │ │ └── dvc_home_page3.png └── reveal.js │ ├── lib │ ├── font │ │ └── external_fonts │ │ │ ├── Capsuula.woff │ │ │ ├── Capsuula.woff2 │ │ │ ├── WhiteRabbit.woff │ │ │ ├── WhiteRabbit.woff2 │ │ │ └── stylesheet.css │ ├── js │ │ ├── html5shiv.js │ │ └── classList.js │ └── css │ │ └── zenburn.css │ ├── plugin │ ├── multiplex │ │ ├── client.js │ │ ├── package.json │ │ ├── master.js │ │ └── index.js │ ├── markdown │ │ ├── example.md │ │ └── example.html │ ├── external │ │ ├── bower.json │ │ ├── LICENSE │ │ ├── external │ │ │ └── external.js │ │ └── README.md │ ├── math │ │ └── math.js │ ├── notes-server │ │ ├── index.js │ │ └── client.js │ └── print-pdf │ │ └── print-pdf.js │ ├── bower.json │ ├── CONTRIBUTING.md │ ├── css │ ├── theme │ │ ├── source │ │ │ ├── serif.scss │ │ │ ├── simple.scss │ │ │ └── moon.scss │ │ ├── template │ │ │ ├── settings.scss │ │ │ └── mixins.scss │ │ └── README.md │ └── print │ │ └── pdf.css │ ├── LICENSE │ ├── package.json │ ├── index.html │ └── Gruntfile.js ├── resources ├── setup_project │ ├── project │ │ ├── classifier │ │ │ ├── __init__.py │ │ │ ├── split.py │ │ │ ├── pre_process.py │ │ │ ├── helper.py │ │ │ └── extract.py │ │ ├── requirements.txt │ │ ├── .gitignore │ │ ├── setup.py │ │ ├── Makefile │ │ └── notebooks │ │ │ ├── evaluate_model.ipynb │ │ │ ├── extract_data.ipynb │ │ │ ├── 
preprocess_data.ipynb │ │ │ ├── train_data_model.ipynb │ │ │ └── split_dataset.ipynb │ ├── data │ │ └── input │ │ │ └── conf.json │ ├── docker │ │ ├── run.sh │ │ └── Dockerfile │ └── solution │ │ ├── configurables │ │ ├── evaluate_model.ipynb │ │ ├── extract_data.ipynb │ │ ├── preprocess_data.ipynb │ │ ├── train_data_model.ipynb │ │ └── split_dataset.ipynb │ │ └── mlvtools │ │ ├── evaluate_model.ipynb │ │ ├── extract_data.ipynb │ │ ├── preprocess_data.ipynb │ │ ├── train_data_model.ipynb │ │ └── split_dataset.ipynb ├── dvc_playground │ ├── user │ │ ├── resources │ │ │ ├── inputs │ │ │ │ ├── parameters.json │ │ │ │ ├── part2.input │ │ │ │ └── part1.input │ │ │ └── steps │ │ │ │ ├── concat_files.py │ │ │ │ └── decrypt.py │ │ ├── dvc_init_repo.sh │ │ ├── Dockerfile │ │ └── private_key │ ├── remote_git │ │ ├── pub_key │ │ └── Dockerfile │ ├── remote_dvc │ │ ├── pub_key │ │ └── Dockerfile │ └── docker-compose.yml ├── dummy │ ├── step4_convert_octals.ipynb │ ├── step1_sanitize_data.ipynb │ ├── dummy_pipeline_feed_2.txt │ ├── dummy_pipeline_feed.txt │ ├── dummy_pipeline_feed_3.txt │ ├── step3_convert_binaries.ipynb │ └── step2_split_data.ipynb ├── 04_Evaluate_model.ipynb ├── 03_Classify_text.ipynb ├── 03_bis_Classify_text.ipynb ├── 02_Tokenize_text.ipynb └── 05_Tune_hyperparameters_with_crossvalidation.ipynb ├── .github └── CODEOWNERS ├── .gitignore ├── requirements.txt ├── tutorial ├── img │ └── setup_project_pipeline.png ├── use_case4.md ├── setup.md └── dvc_overview.md ├── setup.py ├── download_data.py ├── setup.cfg ├── requirements.yml ├── modify_input_data.py └── LICENSE /talks/pyConLT/img: -------------------------------------------------------------------------------- 1 | ../pyData/img -------------------------------------------------------------------------------- /resources/setup_project/project/classifier/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @SdgJlbl @hsmett @alexdashkov @elemoine -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | **/.ipynb_checkpoints/* 3 | __pycache__/ 4 | data/* 5 | 6 | -------------------------------------------------------------------------------- /resources/setup_project/data/input/conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "epoch": 20, 3 | "learning_rate": 0.7 4 | } -------------------------------------------------------------------------------- /resources/setup_project/project/requirements.txt: -------------------------------------------------------------------------------- 1 | fasttext 2 | jupyter 3 | dvc 4 | ml-versioning-tools 5 | nltk -------------------------------------------------------------------------------- /talks/pyData/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/logo.png -------------------------------------------------------------------------------- /talks/workshop/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/logo.png 
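The `conf.json` committed under `resources/setup_project/data/input/` (shown above) carries the training hyperparameters for the tutorial pipeline. A minimal sketch of how a pipeline step might read it — the variable names here are illustrative, not part of the repository:

```python
# Illustrative sketch only: load the tutorial's training hyperparameters.
import json

with open('resources/setup_project/data/input/conf.json') as fd:
    conf = json.load(fd)

epoch = conf['epoch']                  # 20 in the committed file
learning_rate = conf['learning_rate']  # 0.7 in the committed file
```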
-------------------------------------------------------------------------------- /talks/pyData/img/notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/notebook.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_base.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_base.jpg -------------------------------------------------------------------------------- /talks/pyData/img/poc_dev1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_dev1.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_dev2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_dev2.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_dev3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_dev3.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_dev4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_dev4.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_ml1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_ml1.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_ml2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_ml2.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_ml3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_ml3.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_ml4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_ml4.png -------------------------------------------------------------------------------- /talks/pyData/img/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/results.png -------------------------------------------------------------------------------- /talks/pyData/img/roc_curve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/roc_curve.png 
-------------------------------------------------------------------------------- /talks/pyData/img/GitHub-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/GitHub-logo.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/dvc_cmd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/dvc_cmd.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/gen_dvc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/gen_dvc.png -------------------------------------------------------------------------------- /talks/pyData/img/icons/gear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/gear.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_worst1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_worst1.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_worst2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_worst2.png -------------------------------------------------------------------------------- /talks/pyData/img/poc_worst3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/poc_worst3.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | dvc 3 | mlflow 4 | jupyter 5 | pandas 6 | numpy 7 | nltk 8 | pyfasttext 9 | mlvtools 10 | -------------------------------------------------------------------------------- /resources/setup_project/project/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | *.egg-info 3 | __pycache__/ 4 | **.ipynb_checkpoints 5 | *.pytest_cache 6 | *.idea -------------------------------------------------------------------------------- /talks/pyData/img/crying_unicorn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/crying_unicorn.png -------------------------------------------------------------------------------- /talks/pyData/img/global_schema1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/global_schema1.png -------------------------------------------------------------------------------- /talks/pyData/img/global_schema2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/global_schema2.png -------------------------------------------------------------------------------- /talks/pyData/img/icons/database.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/database.png -------------------------------------------------------------------------------- /talks/pyData/img/icons/youtube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/youtube.png -------------------------------------------------------------------------------- /talks/workshop/img/GitHub-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/GitHub-logo.png -------------------------------------------------------------------------------- /talks/pyData/img/confusion_matrix.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/confusion_matrix.png -------------------------------------------------------------------------------- /talks/pyData/img/icons/analytics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/analytics.png -------------------------------------------------------------------------------- /talks/pyData/img/icons/parameters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/parameters.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/cmd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/cmd.png -------------------------------------------------------------------------------- /talks/workshop/img/icons/youtube.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/icons/youtube.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/dvc_home_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/dvc_home_page.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/dvc_home_page1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/dvc_home_page1.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/dvc_home_page2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/dvc_home_page2.png -------------------------------------------------------------------------------- 
/talks/pyData/img/dvc/dvc_home_page3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/dvc_home_page3.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC1.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC2.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC3.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC4.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC5.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC6.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC7.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC8.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC9.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/script1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/script1.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/script2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/script2.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/script3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/script3.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/script4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/script4.png -------------------------------------------------------------------------------- /talks/pyData/img/nb_convert/script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/nb_convert/script.png -------------------------------------------------------------------------------- /talks/pyData/img/nb_convert_script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/nb_convert_script.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC1.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC2.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC3.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC4.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC5.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC9.png -------------------------------------------------------------------------------- /tutorial/img/setup_project_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/tutorial/img/setup_project_pipeline.png 
-------------------------------------------------------------------------------- /resources/dvc_playground/user/resources/inputs/parameters.json: -------------------------------------------------------------------------------- 1 | { 2 | "row_shift": 47, 3 | "col_even_shift": 17, 4 | "col_odd_shift": 65 5 | } -------------------------------------------------------------------------------- /talks/pyData/img/icons/parameters_blue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/parameters_blue.png -------------------------------------------------------------------------------- /talks/pyData/img/icons/parameters_grey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/icons/parameters_grey.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/cmd_param.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/cmd_param.png -------------------------------------------------------------------------------- /talks/pyData/img/nb_docstring_extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/nb_docstring_extract.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/dvc_home_page1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/dvc_home_page1.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/dvc_home_page2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/dvc_home_page2.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/dvc_home_page3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/dvc_home_page3.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change0.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change1.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change2.png 
-------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change3.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change4.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change5.png -------------------------------------------------------------------------------- /talks/pyData/img/mlv_convert/nb_docstring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/mlv_convert/nb_docstring.png -------------------------------------------------------------------------------- /talks/pyData/img/nb_convert/formated_script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/nb_convert/formated_script.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change0.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change1.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change2.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change3.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change4.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change5.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/pipeline/DVC_change2bis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/pipeline/DVC_change2bis.png -------------------------------------------------------------------------------- /talks/pyData/img/dvc/script_docstring_extract.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/dvc/script_docstring_extract.png -------------------------------------------------------------------------------- /talks/workshop/img/dvc/pipeline/DVC_change2bis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/workshop/img/dvc/pipeline/DVC_change2bis.png -------------------------------------------------------------------------------- /talks/reveal.js/lib/font/external_fonts/Capsuula.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/reveal.js/lib/font/external_fonts/Capsuula.woff -------------------------------------------------------------------------------- /talks/reveal.js/lib/font/external_fonts/Capsuula.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/reveal.js/lib/font/external_fonts/Capsuula.woff2 -------------------------------------------------------------------------------- /talks/pyData/img/nb_convert/formated_script_no_effect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/nb_convert/formated_script_no_effect.png -------------------------------------------------------------------------------- /talks/pyData/img/nb_convert/formated_script_not_conf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/pyData/img/nb_convert/formated_script_not_conf.png -------------------------------------------------------------------------------- /talks/reveal.js/lib/font/external_fonts/WhiteRabbit.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/reveal.js/lib/font/external_fonts/WhiteRabbit.woff -------------------------------------------------------------------------------- /talks/reveal.js/lib/font/external_fonts/WhiteRabbit.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peopledoc/mlvtools-tutorial/HEAD/talks/reveal.js/lib/font/external_fonts/WhiteRabbit.woff2 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Python packaging.""" 4 | 5 | from setuptools import setup 6 | 7 | if __name__ == '__main__': 8 | setup() 9 | 
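The root `setup.py` above deliberately calls `setup()` with no arguments: all packaging metadata lives in `setup.cfg`, dumped later in this listing. A hedged sketch of inspecting that metadata programmatically — `read_configuration` is the setuptools helper of this era and may be deprecated in newer releases:

```python
# Sketch: a bare setup() defers to the [metadata] section of setup.cfg.
from setuptools.config import read_configuration

conf = read_configuration('setup.cfg')
print(conf['metadata']['name'])  # 'mlv-tools-tutorial' per the committed setup.cfg
```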
-------------------------------------------------------------------------------- /resources/setup_project/docker/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | IMG_NAME="setup_project_tuto" 3 | 4 | docker build -t $IMG_NAME $(dirname $0) 5 | 6 | docker run -v $(git rev-parse --show-toplevel):/tuto -it $IMG_NAME bash -------------------------------------------------------------------------------- /resources/setup_project/project/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """Python packaging.""" 4 | 5 | from setuptools import setup 6 | 7 | if __name__ == '__main__': 8 | setup(name='tuto_project') 9 | -------------------------------------------------------------------------------- /talks/reveal.js/lib/js/html5shiv.js: -------------------------------------------------------------------------------- 1 | document.createElement('header'); 2 | document.createElement('nav'); 3 | document.createElement('section'); 4 | document.createElement('article'); 5 | document.createElement('aside'); 6 | document.createElement('footer'); 7 | document.createElement('hgroup'); -------------------------------------------------------------------------------- /resources/dvc_playground/user/dvc_init_repo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | pushd ~ 4 | git config --global user.name $(whoami) 5 | git config --global user.email $(whoami)@example.com 6 | 7 | git clone git@git_srv:/srv/git/test_dvc_remote.git 8 | popd 9 | 10 | 11 | dvc remote add dvc_remote ssh://dvc_user@dvc_srv:/data/dvc/remote 12 | dvc config core.remote dvc_remote 13 | tail -f /dev/null 14 | -------------------------------------------------------------------------------- /resources/dvc_playground/remote_git/pub_key: -------------------------------------------------------------------------------- 1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC2LtCP20Y3Dxy1I7lVB98PqoUuFS7ggeC3pHC32r8e8Vfwhj73TVSHPvdUQDNLWXt+NuZpdSFrpWoT6l1YMHyRZArC1QiM0t+4ptt8Fr1baupnXHO3I74gp89+XEMdDjqSR9WZ0MIX1KRV956samUuEdHmEMTw22HaUQBCrk1b2P9J7e5AchNXJWMWITq9Rorzg58Pquj3ejENKlotAAyVzTAwrnfUuTlmTC96GXarJp4Pkx4LlWJv4J18XfrBRfGUD5F23IBMiII9fQtxrQZ3ntb3TALGCFmxs8udT0eXefsej10iAvrllP5Qg70fNKHpD31qn570AirbT+4FEq91 2 | -------------------------------------------------------------------------------- /resources/setup_project/project/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help setup 2 | 3 | #: help - Display callable targets. 4 | help: 5 | @echo "Reference card for usual actions in development environment." 6 | @echo "Here are available targets:" 7 | @egrep -o "^#: (.+)" [Mm]akefile | sed 's/#: /* /' 8 | 9 | 10 | #: setup - Install dependencies. 11 | setup: 12 | pip install cython 13 | pip install -e . 
-r ./requirements.txt -------------------------------------------------------------------------------- /resources/dvc_playground/remote_dvc/pub_key: -------------------------------------------------------------------------------- 1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC2LtCP20Y3Dxy1I7lVB98PqoUuFS7ggeC3pHC32r8e8Vfwhj73TVSHPvdUQDNLWXt+NuZpdSFrpWoT6l1YMHyRZArC1QiM0t+4ptt8Fr1baupnXHO3I74gp89+XEMdDjqSR9WZ0MIX1KRV956samUuEdHmEMTw22HaUQBCrk1b2P9J7e5AchNXJWMWITq9Rorzg58Pquj3ejENKlotAAyVzTAwrnfUuTlmTC96GXarJp4Pkx4LlWJv4J18XfrBRfGUD5F23IBMiII9fQtxrQZ3ntb3TALGCFmxs8udT0eXefsej10iAvrllP5Qg70fNKHpD31qn570AirbT+4FEq91 sbracaloni@poney 2 | -------------------------------------------------------------------------------- /download_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from os.path import dirname, join 3 | from tempfile import mkdtemp 4 | 5 | from sklearn.datasets.twenty_newsgroups import download_20newsgroups 6 | 7 | cache_path = join(dirname(__file__), 'poc', 'data', '20news-bydate_py3.pkz') 8 | 9 | tmp = mkdtemp() 10 | # Temporary directory is removed by download_20newsgroups 11 | buffer = download_20newsgroups(target_dir=tmp, cache_path=cache_path) 12 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/multiplex/client.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | var multiplex = Reveal.getConfig().multiplex; 3 | var socketId = multiplex.id; 4 | var socket = io.connect(multiplex.url); 5 | 6 | socket.on(multiplex.id, function(data) { 7 | // ignore data from sockets that aren't ours 8 | if (data.socketId !== socketId) { return; } 9 | if( window.location.host === 'localhost:1947' ) return; 10 | 11 | Reveal.setState(data.state); 12 | }); 13 | }()); 14 | -------------------------------------------------------------------------------- /resources/setup_project/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN apt-get update && apt-get install -y tree \ 4 | nano \ 5 | vim \ 6 | virtualenv \ 7 | python3-dev 8 | 9 | RUN git config --global user.name tuto_user 10 | RUN git config --global user.email tuto_user@example.com 11 | 12 | WORKDIR /tuto -------------------------------------------------------------------------------- /talks/reveal.js/lib/font/external_fonts/stylesheet.css: -------------------------------------------------------------------------------- 1 | @font-face { 2 | font-family: 'White Rabbit'; 3 | src: url('WhiteRabbit.woff2') format('woff2'), 4 | url('WhiteRabbit.woff') format('woff'); 5 | font-weight: normal; 6 | font-style: normal; 7 | } 8 | 9 | @font-face { 10 | font-family: 'Capsuula'; 11 | src: url('Capsuula.woff2') format('woff2'), 12 | url('Capsuula.woff') format('woff'); 13 | font-weight: normal; 14 | font-style: normal; 15 | } 16 | 17 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/multiplex/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reveal-js-multiplex", 3 | "version": "1.0.0", 4 | "description": "reveal.js multiplex server", 5 | "homepage": "http://revealjs.com", 6 | "scripts": { 7 | "start": "node index.js" 8 | }, 9 | "engines": { 10 | "node": "~4.1.1" 11 | }, 12 | "dependencies": { 13 | "express": "~4.13.3", 14 | "grunt-cli": "~0.1.13", 15 | "mustache": "~2.2.1", 16 | 
"socket.io": "~1.3.7" 17 | }, 18 | "license": "MIT" 19 | } 20 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/markdown/example.md: -------------------------------------------------------------------------------- 1 | # Markdown Demo 2 | 3 | 4 | 5 | ## External 1.1 6 | 7 | Content 1.1 8 | 9 | Note: This will only appear in the speaker notes window. 10 | 11 | 12 | ## External 1.2 13 | 14 | Content 1.2 15 | 16 | 17 | 18 | ## External 2 19 | 20 | Content 2.1 21 | 22 | 23 | 24 | ## External 3.1 25 | 26 | Content 3.1 27 | 28 | 29 | ## External 3.2 30 | 31 | Content 3.2 32 | 33 | 34 | ## External 3.3 35 | 36 | ![External Image](https://s3.amazonaws.com/static.slid.es/logo/v2/slides-symbol-512x512.png) 37 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name=mlv-tools-tutorial 3 | version=0.1 4 | license_file = LICENSE 5 | description = A POC to link them all (DVC, MLflow, MLV-tools). 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | author = PeopleDoc 9 | author_email = sarah.diot-girard@people-doc.com 10 | url = http://github.com/peopledoc/mlv-tools-tutorial 11 | keywords = peopledoc, machine learning, versioning, automate, MLV-tools, DVC, MLflow 12 | classifiers = 13 | Programming Language :: Python :: 3 14 | python_requires = >=3.6 15 | -------------------------------------------------------------------------------- /talks/pyData/overview.md: -------------------------------------------------------------------------------- 1 | - About-US (DUO) 2 | - Intro => DS point of view / SE poview (DUO) 3 | - Poc vs Prod ... vs DS vs SE 4 | - Jupyter Notebooks 5 | - Opposition 6 | - Joke 7 | - NB convert 8 | - MLV tools 9 | - Benefit of MLVtools 10 | 11 | - 2 Months Later (repro) 12 | - PBL 13 | - Git 14 | - Git Lfs 15 | - Pipeline = data x code ... 16 | - 17 | - DVC 18 | - what is it? 
19 | - how it works 20 | - example 21 | - benefits vs. annoyances 22 | - MLV-tools gen_dvc 23 | - MLV-tools ipynb_to_dvc 24 | 25 | - REX (lessons learned) -------------------------------------------------------------------------------- /resources/dvc_playground/remote_git/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch 2 | RUN apt-get update && \ 3 | apt-get -y install openssh-server git vim && \ 4 | mkdir -p /var/run/sshd 5 | 6 | 7 | RUN useradd -d /home/git -m -s /bin/bash git 8 | 9 | 10 | RUN mkdir /home/git/.ssh && chmod 700 /home/git/.ssh 11 | COPY pub_key /home/git/.ssh/authorized_keys 12 | RUN chmod 600 /home/git/.ssh/authorized_keys 13 | RUN chown git: -R /home/git/.ssh 14 | RUN mkdir -p /srv/git/test_dvc_remote.git 15 | RUN cd /srv/git/test_dvc_remote.git/ && git init --bare 16 | RUN chown git: /srv/git/ -R 17 | 18 | 19 | CMD ["/usr/sbin/sshd", "-D"] -------------------------------------------------------------------------------- /talks/reveal.js/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reveal.js", 3 | "version": "3.6.0", 4 | "main": [ 5 | "js/reveal.js", 6 | "css/reveal.css" 7 | ], 8 | "homepage": "http://revealjs.com", 9 | "license": "MIT", 10 | "description": "The HTML Presentation Framework", 11 | "authors": [ 12 | "Hakim El Hattab " 13 | ], 14 | "dependencies": { 15 | "headjs": "~1.0.3" 16 | }, 17 | "repository": { 18 | "type": "git", 19 | "url": "git://github.com/hakimel/reveal.js.git" 20 | }, 21 | "ignore": [ 22 | "**/.*", 23 | "node_modules", 24 | "bower_components", 25 | "test" 26 | ] 27 | } -------------------------------------------------------------------------------- /resources/dvc_playground/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | remote_dvc_repo: 4 | build: 5 | context: ./remote_dvc 6 | ports: 7 | - "22:22" 8 | 9 | remote_git_repo: 10 | build: 11 | context: ./remote_git 12 | 13 | user1: 14 | build: 15 | context: ./user 16 | args: 17 | USER_NAME: "songoku" 18 | command: tail -f /dev/null 19 | links: 20 | - remote_dvc_repo:dvc_srv 21 | - remote_git_repo:git_srv 22 | user2: 23 | build: 24 | context: ./user 25 | args: 26 | USER_NAME: "bulma" 27 | command: tail -f /dev/null 28 | links: 29 | - remote_dvc_repo:dvc_srv 30 | - remote_git_repo:git_srv 31 | -------------------------------------------------------------------------------- /requirements.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - defaults 3 | dependencies: 4 | - numpy-base=1.15.1 5 | - cython=0.28 6 | - ipython=6.1.0 7 | - ipython_genutils=0.2 8 | - jupyter=1.0.0 9 | - nbconvert=5.2 10 | - nbformat=4.4 11 | - notebook=5.2 12 | - numpy=1.13 13 | - pandas=0.20 14 | - pandocfilters=1.4 15 | - pip=9.0.1 16 | - prompt_toolkit=1.0.15 17 | - ptyprocess=0.5.2=py36_intel_0 18 | - pydaal=2018.0.1.20171012=np113py36_intel_0 19 | - pygments=2.2.0=py36_intel_1 20 | - python=3.6.3 21 | - scikit-learn=0.19.0 22 | - scipy=0.19.1 23 | - pip: 24 | - dvc==0.19.7 25 | - ml-versioning-tools 26 | - mlflow==0.7.0 27 | - nltk==3.3 28 | - nose==1.3.7 29 | - pyfasttext==0.4.5 30 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/external/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "external-js", 3 | "authors": [ 4 | "Cal Evans ", 5 | "Matthew Setter "
6 | ], 7 | "description": "External file importer for reveal.js", 8 | "version": "1.0.1", 9 | "main": "external/external.js", 10 | "keywords": [ 11 | "reveal.js", 12 | "external.js" 13 | ], 14 | "license": "MIT", 15 | "homepage": "https://github.com/settermjd/external", 16 | "repository": { 17 | "type": "git", 18 | "url": "git://github.com/calevans/external.git" 19 | }, 20 | "ignore": [ 21 | "**/.*", 22 | "node_modules", 23 | "bower_components", 24 | "test", 25 | "tests" 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /resources/setup_project/project/classifier/split.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import List, Tuple 3 | 4 | 5 | def split_dataset(fasttext_data_set: List[str], test_percent: float) -> Tuple[List[str], List[str]]: 6 | """ 7 | Shuffle and split the input data set into a train and a test set 8 | according to the test_percent. 9 | :param fasttext_data_set: data set on fast text format 10 | :param test_percent: percent of test data (ex: 0.10) 11 | :return: test fasttext data set, train fasttext data set 12 | """ 13 | random.shuffle(fasttext_data_set) 14 | split_idx = round(test_percent * len(fasttext_data_set)) 15 | return fasttext_data_set[0: split_idx], fasttext_data_set[split_idx:] 16 | -------------------------------------------------------------------------------- /resources/dvc_playground/user/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | ARG USER_NAME 3 | RUN apt-get update && \ 4 | apt-get -y install openssh-client vim git tree && \ 5 | mkdir -p /var/run/sshd 6 | 7 | COPY private_key /tmp/ 8 | 9 | RUN useradd -d /home/$USER_NAME -m -s /bin/bash $USER_NAME 10 | 11 | RUN mkdir /home/$USER_NAME/.ssh 12 | COPY private_key /home/$USER_NAME/.ssh/id_rsa 13 | RUN chown $USER_NAME:$USER_NAME -R /home/$USER_NAME/ 14 | RUN chmod 600 /home/$USER_NAME/.ssh/id_rsa 15 | 16 | USER $USER_NAME 17 | ENV PATH=$PATH:/home/$USER_NAME/.local/bin/ 18 | RUN pip install --user dvc paramiko 19 | 20 | RUN git config --global user.name $(whoami) 21 | RUN git config --global user.email $(whoami)@example.com 22 | COPY resources /resources 23 | 24 | EXPOSE 22 -------------------------------------------------------------------------------- /resources/setup_project/project/classifier/pre_process.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List 2 | 3 | from nltk import wordpunct_tokenize 4 | 5 | 6 | def tokenize_and_clean_text(text: str) -> str: 7 | return ' '.join([token.lower() for token in wordpunct_tokenize(text) 8 | if token.isalpha() and token.lower()]) 9 | 10 | 11 | def clean_formatting(text: List[str]) -> str: 12 | return tokenize_and_clean_text(' '.join(text)) 13 | 14 | 15 | def preprocess_data(extracted_data: List[Tuple[str, str]]) -> List[str]: 16 | """ 17 | Transform data to get compliant with fasttext expected 18 | format: __label__[label] [text] 19 | """ 20 | return [f'__label__{data[0]} {clean_formatting(data[1])}' for data in extracted_data] 21 | -------------------------------------------------------------------------------- /resources/dvc_playground/remote_dvc/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:stretch 2 | RUN apt-get update && \ 3 | apt-get -y install openssh-server vim && \ 4 | mkdir -p /var/run/sshd 5 | 6 | RUN groupadd ssh_user 7 | RUN 
useradd -g ssh_user -d /upload -s /bin/bash poney -p azerty 8 | RUN mkdir -p /data/dvc/remote 9 | RUN chown -R root:ssh_user /data/dvc 10 | RUN chown -R poney:ssh_user /data/dvc/remote 11 | RUN chmod ug+w -R /data/dvc/remote 12 | 13 | COPY pub_key /tmp 14 | RUN useradd -g ssh_user -m -d /home/dvc_user -s /bin/bash dvc_user && \ 15 | mkdir -p /home/dvc_user/.ssh/ && \ 16 | cat /tmp/pub_key > /home/dvc_user/.ssh/authorized_keys && \ 17 | chown dvc_user:ssh_user -R /home/dvc_user && \ 18 | chmod 644 /home/dvc_user/.ssh/authorized_keys 19 | 20 | 21 | CMD ["/usr/sbin/sshd", "-D"] -------------------------------------------------------------------------------- /resources/setup_project/project/classifier/helper.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os import makedirs 3 | from os.path import dirname 4 | from typing import List 5 | 6 | 7 | def write_json(json_file: str, data: dict): 8 | """ 9 | Create parent directories if they do not exist. 10 | Write the JSON file. 11 | """ 12 | makedirs(dirname(json_file), exist_ok=True) 13 | with open(json_file, 'w') as fd: 14 | json.dump(data, fd) 15 | 16 | 17 | def write_lines_file(file_path: str, data_list: List[str]): 18 | """ 19 | Create parent directories if they do not exist. 20 | Write the file line by line. 21 | """ 22 | makedirs(dirname(file_path), exist_ok=True) 23 | with open(file_path, 'w') as fd: 24 | fd.writelines(['{}{}'.format(line, '' if line.endswith('\n') else '\n') for line in data_list]) 25 | -------------------------------------------------------------------------------- /resources/dvc_playground/user/resources/steps/concat_files.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import glob 3 | import logging 4 | from argparse import ArgumentParser 5 | from os.path import isdir 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser(description='Concat files from a directory') 9 | parser.add_argument('-i', '--input-dir', required=True, help='Contains files to concat') 10 | parser.add_argument('-o', '--output-file', required=True, help='Result file') 11 | 12 | args = parser.parse_args() 13 | 14 | if not isdir(args.input_dir): 15 | logging.error(f'Not a directory: {args.input_dir}') 16 | else: 17 | with open(args.output_file, 'w') as fd_write: 18 | for file in sorted(glob.glob(f'{args.input_dir}/*.input')): 19 | with open(file, 'r') as fd_read: 20 | fd_write.write(fd_read.read()) 21 | -------------------------------------------------------------------------------- /resources/setup_project/project/classifier/extract.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import List, Tuple 4 | 5 | 6 | def get_json(json_file_path: str) -> dict: 7 | """ 8 | Load JSON content from a given path 9 | """ 10 | try: 11 | with open(json_file_path, 'r') as fd: 12 | return json.load(fd) 13 | except json.JSONDecodeError: 14 | logging.exception(f'Invalid JSON format for pipeline input: {json_file_path}') 15 | except IOError: 16 | logging.exception(f'Cannot open pipeline input: {json_file_path}') 17 | 18 | 19 | def extract_data_from_inputs(json_input_file: str) -> List[Tuple[int, str]]: 20 | """ 21 | Read the input file, then extract pipeline data as a list of tuples 22 | """ 23 | json_content = get_json(json_input_file) 24 | 25 | extracted_data = [(review['ratingOverall'], review['segments']) for review in json_content] 26 | 27 | return extracted_data 28 |
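Taken together, the `classifier` helpers above form a small pipeline: extract raw reviews, normalise them into the fastText format, then split them into train and test sets. A minimal sketch of chaining them outside DVC — the file paths are hypothetical, and the real tutorial drives each step through generated DVC commands:

```python
# Hedged sketch: wire the classifier steps together end to end.
from classifier.extract import extract_data_from_inputs
from classifier.helper import write_lines_file
from classifier.pre_process import preprocess_data
from classifier.split import split_dataset

raw = extract_data_from_inputs('./data/input/reviews.json')  # [(rating, segments), ...]
fasttext_lines = preprocess_data(raw)                        # ['__label__4 clean text', ...]
test_set, train_set = split_dataset(fasttext_lines, test_percent=0.10)

write_lines_file('./data/train_set.txt', train_set)
write_lines_file('./data/test_set.txt', test_set)
```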
-------------------------------------------------------------------------------- /modify_input_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import codecs 3 | import pickle 4 | from os.path import dirname, join 5 | 6 | from sklearn.utils import shuffle 7 | 8 | cache_path = join(dirname(__file__), 'poc', 'data', '20news-bydate_py3.pkz') 9 | 10 | 11 | def shuffle_data(subset: str, cache): 12 | cache[subset].data, cache[subset].target, cache[subset].filenames = shuffle(cache[subset].data, 13 | cache[subset].target, 14 | cache[subset].filenames) 15 | 16 | 17 | with open(cache_path, 'rb') as f: 18 | compressed_content = f.read() 19 | uncompressed_content = codecs.decode(compressed_content, 'zlib_codec') 20 | cache = pickle.loads(uncompressed_content) 21 | 22 | shuffle_data('train', cache) 23 | shuffle_data('test', cache) 24 | 25 | compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec') 26 | with open(cache_path, 'wb') as f: 27 | f.write(compressed_content) 28 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/multiplex/master.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 3 | // Don't emit events from inside of notes windows 4 | if ( window.location.search.match( /receiver/gi ) ) { return; } 5 | 6 | var multiplex = Reveal.getConfig().multiplex; 7 | 8 | var socket = io.connect( multiplex.url ); 9 | 10 | function post() { 11 | 12 | var messageData = { 13 | state: Reveal.getState(), 14 | secret: multiplex.secret, 15 | socketId: multiplex.id 16 | }; 17 | 18 | socket.emit( 'multiplex-statechanged', messageData ); 19 | 20 | }; 21 | 22 | // post once the page is loaded, so the client follows also on "open URL". 23 | window.addEventListener( 'load', post ); 24 | 25 | // Monitor events that trigger a change in state 26 | Reveal.addEventListener( 'slidechanged', post ); 27 | Reveal.addEventListener( 'fragmentshown', post ); 28 | Reveal.addEventListener( 'fragmenthidden', post ); 29 | Reveal.addEventListener( 'overviewhidden', post ); 30 | Reveal.addEventListener( 'overviewshown', post ); 31 | Reveal.addEventListener( 'paused', post ); 32 | Reveal.addEventListener( 'resumed', post ); 33 | 34 | }()); 35 | -------------------------------------------------------------------------------- /talks/reveal.js/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing 2 | 3 | Please keep the [issue tracker](http://github.com/hakimel/reveal.js/issues) limited to **bug reports**, **feature requests** and **pull requests**. 4 | 5 | 6 | ### Personal Support 7 | If you have personal support or setup questions the best place to ask those are [StackOverflow](http://stackoverflow.com/questions/tagged/reveal.js). 8 | 9 | 10 | ### Bug Reports 11 | When reporting a bug make sure to include information about which browser and operating system you are on as well as the necessary steps to reproduce the issue. If possible please include a link to a sample presentation where the bug can be tested. 12 | 13 | 14 | ### Pull Requests 15 | - Should follow the coding style of the file you work in, most importantly: 16 | - Tabs to indent 17 | - Single-quoted strings 18 | - Should be made towards the **dev branch** 19 | - Should be submitted from a feature/topic branch (not your master) 20 | 21 | 22 | ### Plugins 23 | Please do not submit plugins as pull requests. 
They should be maintained in their own separate repository. More information here: https://github.com/hakimel/reveal.js/wiki/Plugin-Guidelines 24 | -------------------------------------------------------------------------------- /talks/reveal.js/css/theme/source/serif.scss: -------------------------------------------------------------------------------- 1 | /** 2 | * A simple theme for reveal.js presentations, similar 3 | * to the default theme. The accent color is brown. 4 | * 5 | * This theme is Copyright (C) 2012-2013 Owen Versteeg, http://owenversteeg.com - it is MIT licensed. 6 | */ 7 | 8 | 9 | // Default mixins and settings ----------------- 10 | @import "../template/mixins"; 11 | @import "../template/settings"; 12 | // --------------------------------------------- 13 | 14 | 15 | 16 | // Override theme settings (see ../template/settings.scss) 17 | $mainFont: 'Palatino Linotype', 'Book Antiqua', Palatino, FreeSerif, serif; 18 | $mainColor: #000; 19 | $headingFont: 'Palatino Linotype', 'Book Antiqua', Palatino, FreeSerif, serif; 20 | $headingColor: #383D3D; 21 | $headingTextShadow: none; 22 | $headingTextTransform: none; 23 | $backgroundColor: #F0F1EB; 24 | $linkColor: #51483D; 25 | $linkColorHover: lighten( $linkColor, 20% ); 26 | $selectionBackgroundColor: #26351C; 27 | 28 | .reveal a { 29 | line-height: 1.3em; 30 | } 31 | 32 | 33 | // Theme template ------------------------------ 34 | @import "../template/theme"; 35 | // --------------------------------------------- 36 | -------------------------------------------------------------------------------- /talks/reveal.js/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (C) 2017 Hakim El Hattab, http://hakim.se, and reveal.js contributors 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
-------------------------------------------------------------------------------- /talks/reveal.js/plugin/external/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Cal Evans 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /talks/reveal.js/css/theme/template/settings.scss: -------------------------------------------------------------------------------- 1 | // Base settings for all themes that can optionally be 2 | // overridden by the super-theme 3 | 4 | // Background of the presentation 5 | $backgroundColor: #2b2b2b; 6 | 7 | // Primary/body text 8 | $mainFont: 'Lato', sans-serif; 9 | $mainFontSize: 40px; 10 | $mainColor: #eee; 11 | 12 | // Vertical spacing between blocks of text 13 | $blockMargin: 20px; 14 | 15 | // Headings 16 | $headingMargin: 0 0 $blockMargin 0; 17 | $headingFont: 'League Gothic', Impact, sans-serif; 18 | $headingColor: #eee; 19 | $headingLineHeight: 1.2; 20 | $headingLetterSpacing: normal; 21 | $headingTextTransform: uppercase; 22 | $headingTextShadow: none; 23 | $headingFontWeight: normal; 24 | $heading1TextShadow: $headingTextShadow; 25 | 26 | $heading1Size: 3.77em; 27 | $heading2Size: 2.11em; 28 | $heading3Size: 1.55em; 29 | $heading4Size: 1.00em; 30 | 31 | // Links and actions 32 | $linkColor: #13DAEC; 33 | $linkColorHover: lighten( $linkColor, 20% ); 34 | 35 | // Text selection 36 | $selectionBackgroundColor: #FF5E99; 37 | $selectionColor: #fff; 38 | 39 | // Generates the presentation background, can be overridden 40 | // to return a background image or gradient 41 | @mixin bodyBackground() { 42 | background: $backgroundColor; 43 | } -------------------------------------------------------------------------------- /talks/reveal.js/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reveal.js", 3 | "version": "3.6.0", 4 | "description": "The HTML Presentation Framework", 5 | "homepage": "http://revealjs.com", 6 | "subdomain": "revealjs", 7 | "main": "js/reveal.js", 8 | "scripts": { 9 | "test": "grunt test", 10 | "start": "grunt serve", 11 | "build": "grunt" 12 | }, 13 | "author": { 14 | "name": "Hakim El Hattab", 15 | "email": "hakim.elhattab@gmail.com", 16 | "web": "http://hakim.se" 17 | }, 18 | "repository": { 19 | "type": "git", 20 | "url": "git://github.com/hakimel/reveal.js.git" 21 | 
}, 22 | "engines": { 23 | "node": ">=4.0.0" 24 | }, 25 | "devDependencies": { 26 | "express": "^4.15.2", 27 | "grunt": "^1.0.1", 28 | "grunt-autoprefixer": "^3.0.4", 29 | "grunt-cli": "^1.2.0", 30 | "grunt-contrib-connect": "^1.0.2", 31 | "grunt-contrib-cssmin": "^2.1.0", 32 | "grunt-contrib-jshint": "^1.1.0", 33 | "grunt-contrib-qunit": "~1.2.0", 34 | "grunt-contrib-uglify": "^2.3.0", 35 | "grunt-contrib-watch": "^1.0.0", 36 | "grunt-sass": "^2.0.0", 37 | "grunt-retire": "^1.0.7", 38 | "grunt-zip": "~0.17.1", 39 | "mustache": "^2.3.0", 40 | "socket.io": "^1.7.3" 41 | }, 42 | "license": "MIT" 43 | } 44 | -------------------------------------------------------------------------------- /talks/reveal.js/lib/css/zenburn.css: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | Zenburn style from voldmar.ru (c) Vladimir Epifanov 4 | based on dark.css by Ivan Sagalaev 5 | 6 | */ 7 | 8 | .hljs { 9 | display: block; 10 | overflow-x: auto; 11 | padding: 0.5em; 12 | background: #3f3f3f; 13 | color: #dcdcdc; 14 | } 15 | 16 | .hljs-keyword, 17 | .hljs-selector-tag, 18 | .hljs-tag { 19 | color: #e3ceab; 20 | } 21 | 22 | .hljs-template-tag { 23 | color: #dcdcdc; 24 | } 25 | 26 | .hljs-number { 27 | color: #8cd0d3; 28 | } 29 | 30 | .hljs-variable, 31 | .hljs-template-variable, 32 | .hljs-attribute { 33 | color: #efdcbc; 34 | } 35 | 36 | .hljs-literal { 37 | color: #efefaf; 38 | } 39 | 40 | .hljs-subst { 41 | color: #8f8f8f; 42 | } 43 | 44 | .hljs-title, 45 | .hljs-name, 46 | .hljs-selector-id, 47 | .hljs-selector-class, 48 | .hljs-section, 49 | .hljs-type { 50 | color: #efef8f; 51 | } 52 | 53 | .hljs-symbol, 54 | .hljs-bullet, 55 | .hljs-link { 56 | color: #dca3a3; 57 | } 58 | 59 | .hljs-deletion, 60 | .hljs-string, 61 | .hljs-built_in, 62 | .hljs-builtin-name { 63 | color: #cc9393; 64 | } 65 | 66 | .hljs-addition, 67 | .hljs-comment, 68 | .hljs-quote, 69 | .hljs-meta { 70 | color: #7f9f7f; 71 | } 72 | 73 | 74 | .hljs-emphasis { 75 | font-style: italic; 76 | } 77 | 78 | .hljs-strong { 79 | font-weight: bold; 80 | } 81 | -------------------------------------------------------------------------------- /talks/reveal.js/css/theme/source/simple.scss: -------------------------------------------------------------------------------- 1 | /** 2 | * A simple theme for reveal.js presentations, similar 3 | * to the default theme. The accent color is darkblue. 4 | * 5 | * This theme is Copyright (C) 2012 Owen Versteeg, https://github.com/StereotypicalApps. It is MIT licensed. 
6 | * reveal.js is Copyright (C) 2011-2012 Hakim El Hattab, http://hakim.se 7 | */ 8 | 9 | 10 | // Default mixins and settings ----------------- 11 | @import "../template/mixins"; 12 | @import "../template/settings"; 13 | // --------------------------------------------- 14 | 15 | 16 | 17 | // Include theme-specific fonts 18 | @import url(https://fonts.googleapis.com/css?family=News+Cycle:400,700); 19 | @import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic); 20 | 21 | 22 | // Override theme settings (see ../template/settings.scss) 23 | $mainFont: 'Lato', sans-serif; 24 | $mainColor: #000; 25 | $headingFont: 'News Cycle', Impact, sans-serif; 26 | $headingColor: #000; 27 | $headingTextShadow: none; 28 | $headingTextTransform: none; 29 | $backgroundColor: #fff; 30 | $linkColor: #00008B; 31 | $linkColorHover: lighten( $linkColor, 20% ); 32 | $selectionBackgroundColor: rgba(0, 0, 0, 0.99); 33 | 34 | section.has-dark-background { 35 | &, h1, h2, h3, h4, h5, h6 { 36 | color: #fff; 37 | } 38 | } 39 | 40 | 41 | // Theme template ------------------------------ 42 | @import "../template/theme"; 43 | // --------------------------------------------- -------------------------------------------------------------------------------- /talks/reveal.js/css/theme/source/moon.scss: -------------------------------------------------------------------------------- 1 | /** 2 | * Solarized Dark theme for reveal.js. 3 | * Author: Achim Staebler 4 | */ 5 | 6 | 7 | // Default mixins and settings ----------------- 8 | @import "../template/mixins"; 9 | @import "../template/settings"; 10 | // --------------------------------------------- 11 | 12 | 13 | 14 | // Include theme-specific fonts 15 | @import url(../../lib/font/league-gothic/league-gothic.css); 16 | @import url(https://fonts.googleapis.com/css?family=Lato:400,700,400italic,700italic); 17 | 18 | /** 19 | * Solarized colors by Ethan Schoonover 20 | */ 21 | html * { 22 | color-profile: sRGB; 23 | rendering-intent: auto; 24 | } 25 | 26 | // Solarized colors 27 | $base03: #002b36; 28 | $base02: #073642; 29 | $base01: #586e75; 30 | $base00: #657b83; 31 | $base0: #839496; 32 | $base1: #93a1a1; 33 | $base2: #eee8d5; 34 | $base3: #fdf6e3; 35 | $yellow: #b58900; 36 | $orange: #cb4b16; 37 | $red: #dc322f; 38 | $magenta: #d33682; 39 | $violet: #6c71c4; 40 | $blue: #268bd2; 41 | $cyan: #2aa198; 42 | $green: #859900; 43 | 44 | // Override theme settings (see ../template/settings.scss) 45 | $mainColor: $base1; 46 | $headingColor: $base2; 47 | $headingTextShadow: none; 48 | $backgroundColor: $base03; 49 | $linkColor: $blue; 50 | $linkColorHover: lighten( $linkColor, 20% ); 51 | $selectionBackgroundColor: $magenta; 52 | 53 | 54 | 55 | // Theme template ------------------------------ 56 | @import "../template/theme"; 57 | // --------------------------------------------- 58 | -------------------------------------------------------------------------------- /talks/reveal.js/lib/js/classList.js: -------------------------------------------------------------------------------- 1 | /*! @source http://purl.eligrey.com/github/classList.js/blob/master/classList.js*/ 2 | if(typeof document!=="undefined"&&!("classList" in document.createElement("a"))){(function(j){var a="classList",f="prototype",m=(j.HTMLElement||j.Element)[f],b=Object,k=String[f].trim||function(){return this.replace(/^\s+|\s+$/g,"")},c=Array[f].indexOf||function(q){var p=0,o=this.length;for(;pbody{font-family: sans-serif;}

</style><h2>reveal.js multiplex server.</h2><a href="/token">
Generate token'); 38 | res.end(); 39 | }); 40 | stream.on('readable', function() { 41 | stream.pipe(res); 42 | }); 43 | }); 44 | 45 | app.get("/token", function(req,res) { 46 | var ts = new Date().getTime(); 47 | var rand = Math.floor(Math.random()*9999999); 48 | var secret = ts.toString() + rand.toString(); 49 | res.send({secret: secret, socketId: createHash(secret)}); 50 | }); 51 | 52 | var createHash = function(secret) { 53 | var cipher = crypto.createCipher('blowfish', secret); 54 | return(cipher.final('hex')); 55 | }; 56 | 57 | // Actually listen 58 | server.listen( opts.port || null ); 59 | 60 | var brown = '\033[33m', 61 | green = '\033[32m', 62 | reset = '\033[0m'; 63 | 64 | console.log( brown + "reveal.js:" + reset + " Multiplex running on port " + green + opts.port + reset ); -------------------------------------------------------------------------------- /talks/reveal.js/plugin/notes-server/index.js: -------------------------------------------------------------------------------- 1 | var http = require('http'); 2 | var express = require('express'); 3 | var fs = require('fs'); 4 | var io = require('socket.io'); 5 | var Mustache = require('mustache'); 6 | 7 | var app = express(); 8 | var staticDir = express.static; 9 | var server = http.createServer(app); 10 | 11 | io = io(server); 12 | 13 | var opts = { 14 | port : 1947, 15 | baseDir : __dirname + '/../../' 16 | }; 17 | 18 | io.on( 'connection', function( socket ) { 19 | 20 | socket.on( 'new-subscriber', function( data ) { 21 | socket.broadcast.emit( 'new-subscriber', data ); 22 | }); 23 | 24 | socket.on( 'statechanged', function( data ) { 25 | delete data.state.overview; 26 | socket.broadcast.emit( 'statechanged', data ); 27 | }); 28 | 29 | socket.on( 'statechanged-speaker', function( data ) { 30 | delete data.state.overview; 31 | socket.broadcast.emit( 'statechanged-speaker', data ); 32 | }); 33 | 34 | }); 35 | 36 | [ 'css', 'js', 'images', 'plugin', 'lib' ].forEach( function( dir ) { 37 | app.use( '/' + dir, staticDir( opts.baseDir + dir ) ); 38 | }); 39 | 40 | app.get('/', function( req, res ) { 41 | 42 | res.writeHead( 200, { 'Content-Type': 'text/html' } ); 43 | fs.createReadStream( opts.baseDir + '/index.html' ).pipe( res ); 44 | 45 | }); 46 | 47 | app.get( '/notes/:socketId', function( req, res ) { 48 | 49 | fs.readFile( opts.baseDir + 'plugin/notes-server/notes.html', function( err, data ) { 50 | res.send( Mustache.to_html( data.toString(), { 51 | socketId : req.params.socketId 52 | })); 53 | }); 54 | 55 | }); 56 | 57 | // Actually listen 58 | server.listen( opts.port || null ); 59 | 60 | var brown = '\033[33m', 61 | green = '\033[32m', 62 | reset = '\033[0m'; 63 | 64 | var slidesLocation = 'http://localhost' + ( opts.port ? ( ':' + opts.port ) : '' ); 65 | 66 | console.log( brown + 'reveal.js - Speaker Notes' + reset ); 67 | console.log( '1. Open the slides at ' + green + slidesLocation + reset ); 68 | console.log( '2. Click on the link in your JS console to go to the notes page' ); 69 | console.log( '3. 
Advance through your slides and your notes will advance automatically' ); 70 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/notes-server/client.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | 3 | // don't emit events from inside the previews themselves 4 | if( window.location.search.match( /receiver/gi ) ) { return; } 5 | 6 | var socket = io.connect( window.location.origin ), 7 | socketId = Math.random().toString().slice( 2 ); 8 | 9 | console.log( 'View slide notes at ' + window.location.origin + '/notes/' + socketId ); 10 | 11 | window.open( window.location.origin + '/notes/' + socketId, 'notes-' + socketId ); 12 | 13 | /** 14 | * Posts the current slide data to the notes window 15 | */ 16 | function post() { 17 | 18 | var slideElement = Reveal.getCurrentSlide(), 19 | notesElement = slideElement.querySelector( 'aside.notes' ); 20 | 21 | var messageData = { 22 | notes: '', 23 | markdown: false, 24 | socketId: socketId, 25 | state: Reveal.getState() 26 | }; 27 | 28 | // Look for notes defined in a slide attribute 29 | if( slideElement.hasAttribute( 'data-notes' ) ) { 30 | messageData.notes = slideElement.getAttribute( 'data-notes' ); 31 | } 32 | 33 | // Look for notes defined in an aside element 34 | if( notesElement ) { 35 | messageData.notes = notesElement.innerHTML; 36 | messageData.markdown = typeof notesElement.getAttribute( 'data-markdown' ) === 'string'; 37 | } 38 | 39 | socket.emit( 'statechanged', messageData ); 40 | 41 | } 42 | 43 | // When a new notes window connects, post our current state 44 | socket.on( 'new-subscriber', function( data ) { 45 | post(); 46 | } ); 47 | 48 | // When the state changes from inside of the speaker view 49 | socket.on( 'statechanged-speaker', function( data ) { 50 | Reveal.setState( data.state ); 51 | } ); 52 | 53 | // Monitor events that trigger a change in state 54 | Reveal.addEventListener( 'slidechanged', post ); 55 | Reveal.addEventListener( 'fragmentshown', post ); 56 | Reveal.addEventListener( 'fragmenthidden', post ); 57 | Reveal.addEventListener( 'overviewhidden', post ); 58 | Reveal.addEventListener( 'overviewshown', post ); 59 | Reveal.addEventListener( 'paused', post ); 60 | Reveal.addEventListener( 'resumed', post ); 61 | 62 | // Post the initial state 63 | post(); 64 | 65 | }()); 66 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/print-pdf/print-pdf.js: -------------------------------------------------------------------------------- 1 | /** 2 | * phantomjs script for printing presentations to PDF. 
3 | * 4 | * Example: 5 | * phantomjs print-pdf.js "http://revealjs.com?print-pdf" reveal-demo.pdf 6 | * 7 | * @author Manuel Bieh (https://github.com/manuelbieh) 8 | * @author Hakim El Hattab (https://github.com/hakimel) 9 | * @author Manuel Riezebosch (https://github.com/riezebosch) 10 | */ 11 | 12 | // html2pdf.js 13 | var system = require( 'system' ); 14 | 15 | var probePage = new WebPage(); 16 | var printPage = new WebPage(); 17 | 18 | var inputFile = system.args[1] || 'index.html?print-pdf'; 19 | var outputFile = system.args[2] || 'slides.pdf'; 20 | 21 | if( outputFile.match( /\.pdf$/gi ) === null ) { 22 | outputFile += '.pdf'; 23 | } 24 | 25 | console.log( 'Export PDF: Reading reveal.js config [1/4]' ); 26 | 27 | probePage.open( inputFile, function( status ) { 28 | 29 | console.log( 'Export PDF: Preparing print layout [2/4]' ); 30 | 31 | var config = probePage.evaluate( function() { 32 | return Reveal.getConfig(); 33 | } ); 34 | 35 | if( config ) { 36 | 37 | printPage.paperSize = { 38 | width: Math.floor( config.width * ( 1 + config.margin ) ), 39 | height: Math.floor( config.height * ( 1 + config.margin ) ), 40 | border: 0 41 | }; 42 | 43 | printPage.open( inputFile, function( status ) { 44 | console.log( 'Export PDF: Preparing pdf [3/4]') 45 | printPage.evaluate(function() { 46 | Reveal.isReady() ? window.callPhantom() : Reveal.addEventListener( 'pdf-ready', window.callPhantom ); 47 | }); 48 | } ); 49 | 50 | printPage.onCallback = function(data) { 51 | // For some reason we need to "jump the queue" for syntax highlighting to work. 52 | // See: http://stackoverflow.com/a/3580132/129269 53 | setTimeout(function() { 54 | console.log( 'Export PDF: Writing file [4/4]' ); 55 | printPage.render( outputFile ); 56 | console.log( 'Export PDF: Finished successfully!' ); 57 | phantom.exit(); 58 | }, 0); 59 | }; 60 | } 61 | else { 62 | 63 | console.log( 'Export PDF: Unable to read reveal.js config. Make sure the input address points to a reveal.js page.' ); 64 | phantom.exit(1); 65 | 66 | } 67 | } ); 68 | 69 | 70 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/external/external/external.js: -------------------------------------------------------------------------------- 1 | /* 2 | * external.js 3 | * Cal Evans 4 | * (c) Evans Internet Construction Company, Inc. 5 | * Released under the MIT license 6 | * Load external files into a reveal.js presentation. 7 | * 8 | * This is a reveal.js plugin to load external html files. It replaces the 9 | * content of any element with a data-external="file.ext" with the contents 10 | * of file.ext. 11 | * 12 | * This started life as markdown.js. Thank you to whomever wrote it. 13 | * Small mods by JJ Merelo, github.com/JJ 14 | */ 15 | 16 | (function(){ 17 | loadExternal(); 18 | 19 | function loadExternal() { 20 | 21 | var sections = document.querySelectorAll( '[data-external]'); 22 | 23 | for( var i = 0, len = sections.length; i < len; i++ ) { 24 | 25 | var this_section = sections[i]; 26 | 27 | if( this_section.getAttribute( 'data-external' ).length ) { 28 | 29 | var xhr = new XMLHttpRequest(), 30 | url = this_section.getAttribute( 'data-external' ); 31 | 32 | // see https://developer.mozilla.org/en-US/docs/Web/API/element.getAttribute#Notes 33 | xhr.onreadystatechange = function() { 34 | if( xhr.readyState === 4 ) { 35 | // file protocol yields status code 0 (useful for local debug, mobile applications etc.) 
36 | if ( ( xhr.status >= 200 && xhr.status < 300 ) || xhr.status === 0 ) { 37 | 38 | this_section.innerHTML = xhr.responseText; 39 | 40 | 41 | } 42 | else { 43 | 44 | this_section.innerHTML = '
' + 45 | 'ERROR: The attempt to fetch ' + url + ' failed with HTTP status ' + xhr.status + '. ' + 46 | 'Check your browser\'s JavaScript console for more details. ' + 47 | 'Remember that you need to serve the presentation HTML from a HTTP server.' + 48 | '
'; 49 | 50 | } 51 | } 52 | }; 53 | 54 | xhr.open( 'GET', url, false ); 55 | 56 | try { 57 | xhr.send(); 58 | } 59 | catch ( e ) { 60 | alert( 'Failed to get the file ' + url + '. Make sure that the presentation and the file are served by a HTTP server and the file can be found there. ' + e ); 61 | } 62 | 63 | } 64 | 65 | } 66 | 67 | return; 68 | } 69 | 70 | })(); 71 | -------------------------------------------------------------------------------- /resources/setup_project/solution/configurables/evaluate_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "model_path = '../data/model/classifier.bin'\n", 10 | "dataset_path = '../data/intermediate/test_dataset.txt'\n", 11 | "metrics_path = '../data/result/metrics_test.txt'" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "with open(dataset_path, 'r') as fd:\n", 21 | " test_data_lines = fd.readlines()\n", 22 | "test_data_lines" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import fasttext as ft\n", 32 | "\n", 33 | "model = ft.load_model(model_path)\n", 34 | "result = model.test(dataset_path)" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "metrics = [\n", 44 | " f'Precision@1: {result.precision}',\n", 45 | " f'Recall@1: {result.recall}',\n", 46 | " f'Nb review: {result.nexamples}'\n", 47 | "]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "for line in metrics:\n", 57 | " print(line)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "from classifier.helper import write_lines_file\n", 67 | "\n", 68 | "write_lines_file(metrics_path, metrics)" 69 | ] 70 | } 71 | ], 72 | "metadata": { 73 | "kernelspec": { 74 | "display_name": "Python 3", 75 | "language": "python", 76 | "name": "python3" 77 | }, 78 | "language_info": { 79 | "codemirror_mode": { 80 | "name": "ipython", 81 | "version": 3 82 | }, 83 | "file_extension": ".py", 84 | "mimetype": "text/x-python", 85 | "name": "python", 86 | "nbconvert_exporter": "python", 87 | "pygments_lexer": "ipython3", 88 | "version": "3.6.5" 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } 94 | -------------------------------------------------------------------------------- /resources/setup_project/solution/configurables/extract_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "reviews_path = '../data/input/trip_advisor.json'\n", 10 | "extracted_data_path = '../data/intermediate/extracted_data.json'" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import json\n", 20 | "with open(reviews_path) as fd:\n", 21 | " data = json.load(fd)\n", 22 | "data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | 
"len(data)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "from classifier.extract import extract_data_from_inputs" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "extracted_data = extract_data_from_inputs(reviews_path)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "extracted_data" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "len(extracted_data)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "from classifier.helper import write_json\n", 77 | "write_json(extracted_data_path, extracted_data)" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "Python 3", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.6.7" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 2 102 | } 103 | -------------------------------------------------------------------------------- /resources/setup_project/project/notebooks/evaluate_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "model_path = '../data/model/classifier.bin'\n", 10 | "dataset_path = '../data/intermediate/test_dataset.txt'" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "with open(dataset_path, 'r') as fd:\n", 20 | " test_data_lines = fd.readlines()\n", 21 | "test_data_lines" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import fasttext as ft\n", 31 | "\n", 32 | "model = ft.load_model(model_path)\n", 33 | "result = model.test(dataset_path)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "metrics = [\n", 43 | " f'Precision@1: {result.precision}',\n", 44 | " f'Recall@1: {result.recall}',\n", 45 | " f'Nb review: {result.nexamples}'\n", 46 | "]" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "for line in metrics:\n", 56 | " print(line)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from classifier.helper import write_lines_file\n", 66 | "\n", 67 | "write_lines_file('../data/result/metrics.txt', metrics)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | 
"codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.6.5" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 2 99 | } 100 | -------------------------------------------------------------------------------- /resources/setup_project/project/notebooks/extract_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "reviews_path = '../data/input/trip_advisor.json'" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import json\n", 19 | "with open(reviews_path) as fd:\n", 20 | " data = json.load(fd)\n", 21 | "data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "len(data)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from classifier.extract import extract_data_from_inputs" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "extracted_data = extract_data_from_inputs(reviews_path)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "extracted_data" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "len(extracted_data)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from classifier.helper import write_json\n", 76 | "write_json('../data/intermediate/extracted_data.json', extracted_data)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.6.5" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /resources/dummy/step4_convert_octals.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dummy pipeline - step 4: convert octal Ascii code to character" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This step convert an Ascii octal value to the corresponding character." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Parameters\n", 24 | "\"\"\"\n", 25 | ":param str octal_data: path to octal data input file\n", 26 | ":param str char_from_octal: path to converted data from octal output file\n", 27 | ":dvc-in octal_data: ./dummy/data/octal_data.txt\n", 28 | ":dvc-out char_from_octal : ./dummy/data/data_conv_from_octal.txt\n", 29 | "\"\"\"\n", 30 | "# Value of parameters for this Jupyter Notebook only\n", 31 | "# the notebook is in ./dummy/pipeline/notebooks\n", 32 | "octal_data = '../../data/octal_data.txt'\n", 33 | "char_from_octal = '../../data/data_conv_from_octal.txt'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "with open(octal_data, 'r') as fd:\n", 43 | " data = fd.read()" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "characters = [f\"{d.split('=')[0]}={chr(int(d.split('=')[1], 8))}\" for d in data.split()]" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "with open(char_from_octal, 'w') as fd:\n", 62 | " fd.write(' '.join(characters))" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# No effect\n", 72 | "print(characters)" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.5" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /resources/setup_project/solution/configurables/preprocess_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "extracted_data_path = '../data/intermediate/extracted_data.json'\n", 10 | "preprocessed_data_path = '../data/intermediate/preprocessed_data.json'" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import json\n", 20 | "with open(extracted_data_path) as fd:\n", 21 | " extracted_data = json.load(fd)\n", 22 | "extracted_data" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "from collections import Counter\n", 32 | "nb_review_by_labels = Counter([d[0] for d in extracted_data])\n", 33 | "\n", 34 | "nb_review_by_labels.most_common()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from classifier.pre_process import preprocess_data\n", 44 | "\n", 45 | "preprocessed_data = preprocess_data(extracted_data)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | 
"source": [ 54 | "preprocessed_data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "from collections import Counter\n", 64 | "nb_review_by_labels = Counter([d.split()[0] for d in preprocessed_data])\n", 65 | "\n", 66 | "nb_review_by_labels.most_common()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "from classifier.helper import write_json\n", 76 | "write_json(preprocessed_data_path, preprocessed_data)" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.6.7" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /resources/setup_project/project/notebooks/preprocess_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "extracted_data_path = '../data/intermediate/extracted_data.json'" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import json\n", 19 | "with open(extracted_data_path) as fd:\n", 20 | " extracted_data = json.load(fd)\n", 21 | "extracted_data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from collections import Counter\n", 31 | "nb_review_by_labels = Counter([d[0] for d in extracted_data])\n", 32 | "\n", 33 | "nb_review_by_labels.most_common()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from classifier.pre_process import preprocess_data\n", 43 | "\n", 44 | "preprocessed_data = preprocess_data(extracted_data)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "preprocessed_data" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from collections import Counter\n", 63 | "nb_review_by_labels = Counter([d.split()[0] for d in preprocessed_data])\n", 64 | "\n", 65 | "nb_review_by_labels.most_common()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "from classifier.helper import write_json\n", 75 | "write_json('../data/intermediate/preprocessed_data.json', preprocessed_data)\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | 
"mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.6.5" 103 | } 104 | }, 105 | "nbformat": 4, 106 | "nbformat_minor": 2 107 | } 108 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/external/README.md: -------------------------------------------------------------------------------- 1 | # External.js 2 | By: Cal Evans 3 | 4 | (c) 2015 [Evans Internet Construction Company, Inc.](http://eicc.com) 5 | 6 | License: MIT 7 | 8 | ## IMPORTANT NOTE ## 9 | This project serves a very specific purpose and as such I don't usually take PRs or respond to requests for new features. You are welcome to fok it and make it your own. 10 | 11 | You can also check out [this version](https://github.com/janschoepke/reveal_external) whcich does seem to be mantained and the author seems to be open to PRs and responding to issues. 12 | 13 | ## Readme.md ## 14 | This is a plugin for Reveal.js. It allows you to specifiy external files to be loaded into a presentation. I developed it for [Zend](http://zend.com) Training. It allows a course, which may be hundreds of slides, to be broken into modules and managed individually. This allows for a course Subject Matter Expert to be working on one module, while the designer is working on another. 15 | 16 | # Using external.js 17 | Using the plugin is easy. First, register it in your Reveal.initalize block. 18 | 19 | { src: 'plugin/external/external.js', condition: function() { return !!document.querySelector( '[data-external]' ); } }, 20 | 21 | Then simply add an element into your presentation with a data-external attribute. 22 | 23 |
24 | 25 | In my example, I load in all sections, so my main presentation looks like this. 26 | 27 |     [main presentation markup lost in extraction: one <section data-external="..."></section> element per module, inside the usual reveal.js slides container] 36 | 37 | A sample of one of the files would look like this: 38 | 39 |     <section> 40 |         <h1>This is a slide</h1> 41 |         <ul> 42 |             <li>Point 1</li> 43 |             <li>Point 2</li> 44 |             <li>Point 3</li> 45 |         </ul> 46 |     </section> 47 | 48 | 49 | 50 |     <section> 51 | 52 | 53 |         <h1>This is a second slide</h1> 54 |         <p>Just to show that you can load multiple slides at a time, this is a second slide.</p> 55 |     </section>
56 | 57 | This makes each include file its own sub-module that can be navigated 58 | by the up and down cursor keys as well as the space bar, but modules can be switched by using 59 | left and right. 60 | 61 | You can of course do it differently. You can also still do subsections for slides within a separate file. Anything that can normally be done in reveal.js can be done inside an externally loaded file. 62 | 63 | # Version 64 | - 1.0.0 Initial Release 65 | 66 | # Maintainer 67 | [Cal Evans](https://blog.calevans.com) 68 | -------------------------------------------------------------------------------- /resources/setup_project/project/notebooks/train_data_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "train_dataset_path = '../data/intermediate/train_dataset.txt'\n", 10 | "conf_path = '../data/input/conf.json'" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "with open(train_dataset_path, 'r') as fd:\n", 20 | " train_data_lines = fd.readlines()\n", 21 | "train_data_lines" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import json\n", 31 | "with open(conf_path, 'r') as fd:\n", 32 | " conf = json.load(fd)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import fasttext as ft\n", 42 | "from tempfile import TemporaryDirectory\n", 43 | "import shutil\n", 44 | "from os import remove, makedirs\n", 45 | "from os.path import join, exists, dirname\n", 46 | "def train(fasttext_data_path: str, fasttext_model_path: str, epochs: int, learning_rate: float):\n", 47 | " with TemporaryDirectory() as tmp_dir:\n", 48 | " # Fasttext automatically adds .bin at the end of the output model file name, so\n", 49 | " # we use a temporary file to keep control of the output file path\n", 50 | " model_tmp_path = join(tmp_dir, 'model')\n", 51 | " ft.supervised(fasttext_data_path, model_tmp_path, lr=learning_rate, epoch=epochs, silent=0)\n", 52 | " if exists(fasttext_model_path):\n", 53 | " remove(fasttext_model_path)\n", 54 | " makedirs(dirname(fasttext_model_path), exist_ok=True)\n", 55 | " shutil.copy(f'{model_tmp_path}.bin', fasttext_model_path)\n", 56 | "\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "train(train_dataset_path, '../data/model/classifier.bin', epochs=conf['epoch'],\n", 66 | " learning_rate=conf['learning_rate'])" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [] 75 | } 76 | ], 77 | "metadata": { 78 | "kernelspec": { 79 | "display_name": "Python 3", 80 | "language": "python", 81 | "name": "python3" 82 | }, 83 | "language_info": { 84 | "codemirror_mode": { 85 | "name": "ipython", 86 | "version": 3 87 | }, 88 | "file_extension": ".py", 89 | "mimetype": "text/x-python", 90 | "name": "python", 91 | "nbconvert_exporter": "python", 92 | "pygments_lexer": "ipython3", 93 | "version": "3.6.5" 94 | } 95 | }, 96 | "nbformat": 4, 97 | "nbformat_minor": 2 98 | } 99 | --------------------------------------------------------------------------------
/resources/dummy/step1_sanitize_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dummy pipeline - step 1: sanitize data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This step extracts a text from an input file then remove not supported characters." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Parameters\n", 24 | "\"\"\"\n", 25 | ":param str input_data: path to input file\n", 26 | ":param str sanitized_data: path to the output file\n", 27 | ":dvc-in input_data: ./dummy/data/dummy_pipeline_feed.txt\n", 28 | ":dvc-out sanitized_data : ./dummy/data/sanitized_data.txt\n", 29 | "\"\"\"\n", 30 | "# Value of parameters for this Jupyter Notebook only\n", 31 | "# the notebook is in ./dummy/pipeline/notebooks\n", 32 | "input_data = '../../data/dummy_pipeline_feed.txt'\n", 33 | "sanitized_data = '../../data/sanitized_data.txt'" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "with open(input_data, 'r') as fd:\n", 43 | " data = fd.read().strip('\\n')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import re\n", 53 | "data = re.sub('[^0-9=\\s]', '', data)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "with open(sanitized_data, 'w') as fd:\n", 63 | " fd.write(data)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# No effect \n", 73 | "\"\"\"\n", 74 | "Cells with \"# No effect\" comment will be ignore for the Python 3 script generation.\n", 75 | "They are used to see intermediate results only in notebooks\n", 76 | "\"\"\"\n", 77 | "# check result\n", 78 | "print(data)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [] 87 | } 88 | ], 89 | "metadata": { 90 | "kernelspec": { 91 | "display_name": "Python 3", 92 | "language": "python", 93 | "name": "python3" 94 | }, 95 | "language_info": { 96 | "codemirror_mode": { 97 | "name": "ipython", 98 | "version": 3 99 | }, 100 | "file_extension": ".py", 101 | "mimetype": "text/x-python", 102 | "name": "python", 103 | "nbconvert_exporter": "python", 104 | "pygments_lexer": "ipython3", 105 | "version": "3.6.5" 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | -------------------------------------------------------------------------------- /resources/dummy/dummy_pipeline_feed_2.txt: -------------------------------------------------------------------------------- 1 | 14=1A#62- L1G1_8=110A0011 #o111t=11001#01 re74-=GA1100G%1t*+10( Rm#@(2t%F9=G)*154R @ 77=01000_0GG0 i*ps-64=Rt1A10-01)01 (um4@(@4=11t0*FG0101 dot1(2=)*011F+01t)1)1(1@(A +l138@=*111t0-011 orA95=11R%G1(0_0#1A1F sF*i41=0-t%t41_ tt )0=)01+Gt00*)10AR0%(1 am5tR#7=011@1010(_R R#e-t32=t1#A6AF0 ,t6F9G=t1(1R)0_F10F1R1A@* F48R-=11G1010_0 cGo1)07=110%A1+1GFR%1+1 n1-@28%RR=_A110)1@1@11* t(RsF-ec1At14=11)0-1)1%00* @tet35=01*G10++010#1#) u1=0R@1*-110FF100 )rF# 37=)t_%)15A1 @%ad99=(1(@1)t0R01+11 ip7=A011t00%%00G#1 i11(7G=(11+A*011F11 scti1-*3t5=11011%11t 
%ng1%36*#=11011#11 GG( %el)8)=A0111*001t1 (i9G0)=t010#0000-A tR-)(F._* 11_5*-=+110010t1 tN9=0111100t1) un21=F01A@1%000@)1_1 )c93=#t11101*00 t%nu+1)8=0%+#1#A101#-#R@11*G1*( Rnt_F++c 124=11)R00AF101F lac10)F%6@=11-00011 u%62=(11%t+0*0@)G001 Gs,1A09=R)0R1R01+1G1A1%_(- (@ lo1+F01=111_0100 bo#)rA*8*_)(2=AR#1R#F1(G1#(0F0%_1-##0 ti1#03=1110101 s38tt=15)6* 1F2G1=_1GAA10*11)0*0* t)nRF()on@_ 2-*4=141 orAci65*=_01(0_0000 ( qui3(6%=R15+4 As(, 5@5=1100%1R1-R1 @heF-nd1R@1(3=11100F0%t(+0 (reR*%r8t@3=1101%00-1 *it d_2)3=04t0 Rtt)i)g#n*+-45=110001%%%1 i-%119)+=010(1A11-R1 s(*F%sit_8+0R@#+@=110%)1-1R@10 Am nF_87+=110G111*1R)F e5t8@=0A(100%00R0 que%._5A@%1=11)00t101%R I)nt_)54@=11000+01 e7+G9=1-101111- gAerR(2)F7=1A50F ) n@o46=A1110(A__0G10) n91=110#@100G0 )n+F#iA126=111R001A1 F-_-Gs-A+%*)l22A=14_F*5 *- -125R=111-0-010#F +non 5=001000@00GtG#_% _risu@8@-9(=11(t10011 _s)_#R +Gu()-l_85*=%@R)F#11F10101 R-tr+(i6*3=+1G)11t0010A cie5(3=1_F11G+0011#) sG (#@d%i%A4=*163@ gnGi+s10_0t=11A01GF001 %s+_)Rim.56=%1100101 73=@*0+@100000_ S+#e6@*0*=1@1*+F)0010(t1(_ d@ l@11@G0=1RG110000 o19(=0-*t11_@F0-0R1F0@RG0G bF1(08=1101_101(G o132G=110G0111 r@tiG7+#0#=1A1-0-10R0@)__1 Fs 129=(-1101110 +satG*pi4%7(=1_1F00101 e-n *ut(25(=04+0RR %t* n16*=1R#60 i97=01-01111% s4(1=G0F41- i)t %96=01+110R10 veFn1AR37+*=%1G@101%A%1_t00 eGna@ti%-33=1%*(5)1 sR 81@=1G110100 au102F*-=110100(0 R#c)tA*A%tor92=1+11))0FR1FR+R00 t. AR94=1R110%%)0%0_F-0 Vets@1G-0t@F=040 G#@tA+i+bFul6=01Gt1+00101 um# 1+20=1_1G0@1101 tpo*+122=G010#A110#)*#1_ rtt7F8*t=1F1A000G1F1 t10t4*(=110001#0@R itAo#RGr40=#A040 )G aRli+qF71*=11%0111+F0 u@@aFm+ 31=04R0 eRn_Aim30-=G01100101-F 13=0#4G0 se1)33G*=+01_t011t+t%01 m-pt72-=11G0A0111%_ er 13)4-)*=111010G0 @t*20=01110101 e_mp-u(8(6=11#)1#0100 sF. E3R=151 *tiaAFmt2%6#t=167 98+=010G)A111-G1) vi#t(ae a105G=++010F1#@Ft11@0% Auct_tor eG+6#7@+)-=_-F11t01111 RAros1-_27=11_01(*#0F0F1 . #5F2-%_=%1RA11001(1%*@ FPrae*11=01110100% s(#e@7+5+@=F110@1111A ntA A3_9#=14t(G@5 #)ac -alF-iAq28=--01101111 -uett a(-G68=@#t1#10tR11_R(11t u#+guA%te*, %_qu8(4=110_00+1GF0_ its# a+GcGc+um131=_1_F1011*10 #(san )+130=AA-R11_#(+0##1001( @__n*ib+h.49F=_0100000 @@RU-#t2(=-04A#(A0 F Fdign_@i--ssA##34=-16@0(A im +augRue (116-+=%*110010G0)+ vesti*t-G+_bulu88+R=%111001+@0* m t+()_@emp15=)14%5 oRr f_erAm%Gen1(23-=RA@*1%11011-)0% tu*m.AG F)M(50=110_1_10@1% aR-RecGe112#A=11##F01111G Anas+* %pAort6(1#A_(#*=01000_@%00G@ GtitR+or# nu43=1#0+%100R11 ll5*9=1110*1)11_ AFa n_o_n##G -Ghe_)A%ndre7_6(+=1-R1t#_10%0*10R ri42=0-1@0#0*00_0) t e*#_-)letif-Ge+n%dG.*A+@ RF1t7=162t C)u*-rab6(6*(t@t=11+011*00FA i(t(ur auc@tor% p)h+a_ret%ra FcRo(nsecteFtAu+r. 
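The dummy_pipeline_feed files above are deliberately noisy: each token is a 'position=ASCII code' pair with junk characters mixed in, and the dummy pipeline steps clean and decode them. A small sketch of the decoding logic on a made-up token (the token itself is hypothetical; the regex and conversion mirror step1_sanitize_data.ipynb and the step3/step4 conversion notebooks):

    import re

    # A token in the style of the feed files: 'position=code' plus noise.
    raw = 'ab26=1110100#'

    # Step 1 (sanitize): keep only digits, '=' and whitespace.
    clean = re.sub(r'[^0-9=\s]', '', raw)   # -> '26=1110100'

    # Steps 3/4 (convert): the value is an ASCII code, binary here
    # (step 4 applies the same idea with base 8 for octal values).
    index, code = clean.split('=')
    print(index, chr(int(code, 2)))         # -> 26 t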
-------------------------------------------------------------------------------- /resources/setup_project/solution/mlvtools/evaluate_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\"\"\"\n", 10 | ":param str model_path: Path to the model input file\n", 11 | ":param str dataset_path: Path to the dataset input file\n", 12 | ":param str metrics_path: Path to the result metrics output file\n", 13 | "\n", 14 | ":dvc-in model_path: ./data/model/classifier.bin\n", 15 | ":dvc-in dataset_path: ./data/intermediate/test_dataset.txt\n", 16 | ":dvc-out metrics_path: ./data/result/metrics_test.txt\n", 17 | "\"\"\"\n", 18 | "# The following code in this cell will not be added to the generated Python script\n", 19 | "# These are values for notebook use only\n", 20 | "model_path = '../data/model/classifier.bin'\n", 21 | "dataset_path = '../data/intermediate/test_dataset.txt'\n", 22 | "metrics_path = '../data/result/metrics_test.txt'" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# No effect\n", 32 | "with open(dataset_path, 'r') as fd:\n", 33 | " test_data_lines = fd.readlines()\n", 34 | "test_data_lines" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import fasttext as ft\n", 44 | "\n", 45 | "model = ft.load_model(model_path)\n", 46 | "result = model.test(dataset_path)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "metrics = [\n", 56 | " f'Precision@1: {result.precision}',\n", 57 | " f'Recall@1: {result.recall}',\n", 58 | " f'Nb review: {result.nexamples}'\n", 59 | "]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# No effect\n", 69 | "for line in metrics:\n", 70 | " print(line)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from classifier.helper import write_lines_file\n", 80 | "\n", 81 | "write_lines_file(metrics_path, metrics)" 82 | ] 83 | } 84 | ], 85 | "metadata": { 86 | "kernelspec": { 87 | "display_name": "Python 3", 88 | "language": "python", 89 | "name": "python3" 90 | }, 91 | "language_info": { 92 | "codemirror_mode": { 93 | "name": "ipython", 94 | "version": 3 95 | }, 96 | "file_extension": ".py", 97 | "mimetype": "text/x-python", 98 | "name": "python", 99 | "nbconvert_exporter": "python", 100 | "pygments_lexer": "ipython3", 101 | "version": "3.6.7" 102 | } 103 | }, 104 | "nbformat": 4, 105 | "nbformat_minor": 2 106 | } 107 | -------------------------------------------------------------------------------- /talks/pyData/draft.md: -------------------------------------------------------------------------------- 1 | Focus the talk on the opposition between the two worlds, especially at the beginning; as a result, they come together at the end. 2 | 3 | Overview: 4 | 5 | - Presentation 6 | - Sarah: [short bio] + hook on the technologies => I use Jupyter notebooks 7 | and I need to... be able to reproduce easily, keep some flexibility, ... 8 | 9 | - Stephanie: [short bio] + hook on Automation, Delivery, Tests 10 | I need...
something that is easy to launch, that can be packaged, that is reproducible 11 | in any environment 12 | 13 | 14 | - Why => Our story: basically, a catchy title for porting the 15 | PoC (several executable Jupyter notebooks on one machine) to production (or at least the "industrialization" step of the project), 16 | mixing the dev and data-scientist worlds 17 | 18 | - the long and winding road from PoC to production ... at the crossroads of the two worlds 19 | 20 | - POC vs PROD ... vs Data scientist vs Software Developer 21 | 22 | 23 | - The POC: 24 | 25 | - set of notebooks, some data, name-based versioning, specific server/user 26 | [Show a repo overview] 27 | 28 | - Step 1: express our needs 29 | 30 | - Automation/Scripting (first step) 31 | 32 | ML side: keep using Jupyter notebooks 33 | Dev side: be able to easily run the tool and version a standardized format under git, 34 | tests, CI 35 | 36 | - Reproducibility/Pipelining/Versioning 37 | 38 | => Going further in the automation process 39 | => No loss, be more confident 40 | => Easily perform experiments 41 | => Handle data sharing 42 | 43 | 44 | ML side: be able to experiment, avoid reproducing time-consuming steps, keep tracking data, 45 | share with the team, organization (no more inconsistent references to name-versioned notebooks, execution order 46 | and dependencies) 47 | 48 | 49 | Dev side: be able to reproduce any configuration (data + hyperparameters + code) on any server, 50 | keep tracking the state-of-the-art pipeline for further delivery, be able to handle client specificities 51 | 52 | 53 | [Diagram representing the needs] 54 | 55 | - Step 2: Organization start: we need Python scripts from Jupyter notebooks 56 | 57 | - Existing solutions: nbconvert 58 | 59 | - Issues: not parametrized, and no handling of no-effect cells 60 | 61 | - MLV-tools: ipynb_to_python 62 | 63 | 64 | - Step 3: We need to handle data versioning and pipelining 65 | 66 | - Existing solution: git lfs => data ok, pipelining nok 67 | - Existing solution: dvc => data ok, pipelining ok BUT...
[not easy to use and based on bash commands 68 | but good news: we already have scripts] 69 | - example DVC 70 | - show why it's painful 71 | 72 | - MLV-tools: from Jupyter notebook to a pipeline step 73 | 74 | 75 | 76 | - REX (lessons learned) 77 | 78 | => flexibility for experimentation (Commerzbank) 79 | => data loss 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /resources/setup_project/solution/mlvtools/extract_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\"\"\"\n", 10 | ":param str reviews_path: Path to the reviews JSON input file\n", 11 | ":param str extracted_data_path: Path to the extracted data output file\n", 12 | " \n", 13 | ":dvc-in reviews_path: ./data/input/trip_advisor.json\n", 14 | ":dvc-out extracted_data_path: ./data/intermediate/extracted_data.json\n", 15 | "\"\"\"\n", 16 | "# The following code in this cell will not be added to the generated Python script\n", 17 | "# These values are only for notebook purposes\n", 18 | "reviews_path = '../data/input/trip_advisor.json'\n", 19 | "extracted_data_path = '../data/intermediate/extracted_data.json'" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# No effect\n", 29 | "import json\n", 30 | "with open(reviews_path) as fd:\n", 31 | " data = json.load(fd)\n", 32 | "data" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# No effect\n", 42 | "len(data)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "from classifier.extract import extract_data_from_inputs" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "extracted_data = extract_data_from_inputs(reviews_path)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "# No effect\n", 70 | "extracted_data" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "# No effect\n", 80 | "len(extracted_data)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "from classifier.helper import write_json\n", 90 | "write_json(extracted_data_path, extracted_data)" 91 | ] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.7" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /resources/setup_project/solution/configurables/train_data_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "train_dataset_path = '../data/intermediate/train_dataset.txt'\n", 10 | "conf_path = '../data/input/conf.json'\n", 11 | "model_path = '../data/model/classifier.bin'" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "with open(train_dataset_path, 'r') as fd:\n", 21 | " train_data_lines = fd.readlines()\n", 22 | "train_data_lines" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import json\n", 32 | "with open(conf_path, 'r') as fd:\n", 33 | " conf = json.load(fd)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "conf" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import fasttext as ft\n", 52 | "from tempfile import TemporaryDirectory\n", 53 | "import shutil\n", 54 | "from os import remove, makedirs\n", 55 | "from os.path import join, exists, dirname\n", 56 | "def train(fasttext_data_path: str, fasttext_model_path: str, epochs: int, learning_rate: float):\n", 57 | " with TemporaryDirectory() as tmp_dir:\n", 58 | " # Fasttext automatically add .bin at the end of the output model file name so\n", 59 | " # we use a temporary file to keep control on output file path\n", 60 | " model_tmp_path = join(tmp_dir, 'model')\n", 61 | " ft.supervised(fasttext_data_path, model_tmp_path, lr=learning_rate, epoch=epochs, silent=0)\n", 62 | " if exists(fasttext_model_path):\n", 63 | " remove(fasttext_model_path)\n", 64 | " makedirs(dirname(fasttext_model_path), exist_ok=True)\n", 65 | " shutil.copy(f'{model_tmp_path}.bin', fasttext_model_path)\n", 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "train(train_dataset_path, model_path, \n", 76 | " epochs=conf['epoch'], learning_rate=conf['learning_rate'])" 77 | ] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 3", 83 | "language": "python", 84 | "name": "python3" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 3 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython3", 96 | "version": "3.6.7" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 2 101 | } 102 | -------------------------------------------------------------------------------- /resources/dummy/dummy_pipeline_feed.txt: -------------------------------------------------------------------------------- 1 | L1tA_26=1110100% o32=171 r1G0_5-+=1#100111 e)F@)139=1%100011 )(m 58G=0R0100_0**00 i1A54=G1R)1t0_1100 p(s76#=10F10011 %Rum8FF5((=1RF1100@A11 8A1=11+R10+100 d1F0R4=110_111)(t0 o84=1A_100101G lo3=0R0100+%00@A0 @r A64(=01+1+10G-t#-0@0t0 s11A4F=1%11*0t1%-0FR0 iG53=156 -t 2G7=00F100000 a16=1*45 m12=#1t64 e1A#66=0A*10)AR#1+*101# t,46=@0F*111-0%@111F Rc#93=1100101 on2%6=163t s*et2@8=1GG+44 ct112F=-11_0111)A1+ et33+=040@t ur-1@08R=110_G1*11G1 128=1110011 aAd80)(=110010)1 Ri)p24=011)#-010@0)0 +is-1t7=0110_1110 _cA19=040- inA71=00%*@100FG*0(0RR0) -g 97=-1%1-@00+10G(1 e@(li60=157# _t. 
125=1110@1R_0A0 N1_6tF1G=1(101111A unc159%_=(1t1100R%%+G@1A1 _ _n+u()R1*41=A(1101101 nct 91_=@0100A000 la21=011%(001#1%0_ c(-us6=01100@0)0)GF1 ,+1%7@1t=F1_1-t10R0t1-1 @ %l(4*8=G0R(11A#101(AA11 o4)=162 boRr70+=t)R1_45 ti8=1@50 s10)1_=110111#1R #n*o79+=1)1A100*1@0A n o9=011001@*0F1t G+r160-=11-0G1G00)1 @ci 31=0(110F@)1101 qu94G=0+1#00000- i@s-t,13_4@@=1110100R FG+he10=G0t1-t100A100F n69=@01100)0)11 d1%*=157G re106=01000_00 rit5)1=143 d50=04@G0 tFi@gA156=%t1110110 nAi49=*1G4F5 ssi43=00100+0@00 m_ 142=A+*0101111 ne*1%58=1F1)10%0(1A0 @#queA14A8=1100101 .30=1R+--55 + IR95=1)#1*0000R1A_- nt1A13+G=1()%101+1)-10(tR e(g)e92#=A11@10111 r) 15=040 +t%non1A6*2=11011%GtAAA@10 tn1-5)1=11000G%11@ i#s-18=14t4#+% (l34GG#=01G11000(0 42=0)5%F4 no@1+1(8=111t010@)1 n 122@=1110011 r87=11(00001 i129At=011#1*010 s((23=+F011(1010(0 u-117=11t00010 #s 170=#11%01100* uRl1(@36-*=1110@_101 t#r55=tG011_10#100 ic#i*111=1+1000t11 e)FA1)R#16=11010F01 F*s54@)))=G00100000 Gd37=1@45( +Aig1+3F_@2=11-0-0A11@1 n-56=162( R+-iss+8%9=110G010((1 im.tA1t49G=G110010F0 5=1(45F Se121=1%110t010 d77=#11_001R0+1 + tlo13(3=11t0t1001 b(65=011+#*10t01A0 orFt164=11A011_1@0 i2+5)=1GF51_ #)s123=AR0-100)%F__)%000 sa57=#)*0(1+11(1)0%_01A p59=t-01#110100_ i_en #82#=0_R1000_%00 u%t15%5=01#01101 nis145(=1101)11#1 i#%68=165 90=*011101*0 -#ve-(nA%62=011t10010 ena1+35=1tG1Rt01%00#0 tis 1#37=1_)100R010 auGc#7(4=041 tor16*3*=1(101001 . V14#4=@1100)10(1 +e1)10G=0F100000 sAtiA102(=110+%1011 b1#0-0=t110_1*%_+1R11 ttulAu8t8t=1100111 m po3F%*8=(154 rtti45=157 G*At169%=-_1)101111 or131%=R010)11#G11 ali73=01110G100 qu+aRmRF7=143 e61#=040 -ni86=1110011 m s98=0100000%* GemFpe*1t#4G3@*=11_10000 #_FrF29)_=0A111010_1 153=11011G0R1GG ttem%p_-147=1101t100 u_12R4(=1101*GF000 s(.F#107_=110A0110 ( Et)ia@67+=t0110+0100 m v146t=F111%00)R00+ @FitF%a0=0*10*110_01 FAe_ 13=01(10100G0 a1G38=+0101110 uc*109=++-111--R0010# A+torA#11=0010*00GtF*00 eros120F=11-0_1111 @. @)Pr(a7#-F8=+11_A00011 esFtFen140=11(01R111 t#A Ra168@-=110_(11G11( c ali*%5#2*=01@1000#01 @que4F0(=01-10@_+R1%110 F%t Ftau1#19=1110100 )gu(e@,4#4=1R*%%56 _#+ 157G=1100101 qu-i@16-5=@110011R1 s a+t9)6*=11100_G10 ccu*msa*41R=01R10)AG01R01 n Rnib130=0+10+_)1111* h2((%0=01101111 . 39=t15%1 U-t 75=)t0100000 diRgni1*03=%1101001* ss7F2#AG+=_1-51 +imGG aRug3%+5=F)%15R1 ue ve1(4%=01)1001R0@1% Gst@8_R-_3=1101101 ibu7AR4=0-41- -lum- *t%emG*##+pt*oG2*-2F=00F1G00000_ r *ferFm2(=165 @)e66=0R+1101+@)G%1_F_+11 nAtu@Fm. 127=11100A00 Maec1)52=0R1011_11 e%nas@ pRRor4)A7=04FR(0# ttito-r nu*+*@lla99=1R1011)00t% nFon# 150=1101+%@11*__1 h#%etndr*Ge-r1%AF67=1F11010F#*0 iA36(t=16-+0 t e*63=1_45 leifen*d.*R Cur(##abiturR#1*15=11100(-1-t0t Fat-u)ctFor phareRtr%a c(o-n-@-sectGRetur. 
-------------------------------------------------------------------------------- /resources/dummy/dummy_pipeline_feed_3.txt: -------------------------------------------------------------------------------- 1 | Lo@26=_*#143#+( r)eFm2+7=0110%A*0101R (%%i#A%p94#FA=11+0F1t0A0)AA1 sum14_4=%A0F%+*10GFtFA11%0R1 do1GA10=1(_10_0+@1+A11) l57=_11_1+00R1-GA0 o98=11A_0#)@1_1*1*1 r100=1#1+10G0+1(1 A Rts%i+R%13=*01A110F0(1+1 t a1*1=0%@1_1-0*0-10@1 me1)+12=1t11010G0 t,_# R75+=G1-#10F0(@101 Ac51=G04@1 ons97=#1At*A110@100R- ect81=1)1(@01001@ e3-6=0_%5t-5 %tAur3=*15@1* (* a#-121=%1#+11)00**00 FG@Atdti9R2t=F_111+A*0-(100 p+127(tG=11001@0FA0 iscA109=01-01111 %i28=#+04@0 )n1#)02=1%G10100tF#0 g_ -e%_53=01G0@0FR(F000-A li1t13=1F1010-0(_0 t.t 9)1=1t10%1R110 Nu**%80=t)1F1@A01R0@11# Gn74-A=11*10)010% cGR (0#=A@1%(11A )n(_un7*(9=_G11+*01#111 (c68(=(0111010G la1@37_=1R11-0@0-11 c52A=+0%010+0(001 _#Rus5F9=t*1#1F*10100 ,3(_2@(=-)G04(0*+) l-+oG47=-F01-10*1(1F11 *b30F=%0_010_@00F00R tor1R40=@1%1R0tG1_1R10 *t46=134 _i11F1=F#1101001 s 8G@)5*=-*%+110()0#)%1)1#0 n%138=R110_10*01 Ron41R%)%+=1F54 A*63=)1(110011 or125=%1101100tt c#i14=17t1 tR+ +qu14)5=1-*@11010*0 @i3#_5(=14@2t )s, 3#9=0R11%G%*_@10%000%G R+hen*17R@=157 dreR66G=1100%111 rG12)_GA3=110@111AG1 itF1--15=G110001G%F0 ) GG(d+i(+133=0101+F1%01F)( g119=11#0#1t101A nR#A%iRsR6+G4=11t#@%1+(0011F *Rs(%A#it@m83=+1%1%00111 %_R3+#3=R+A0GF1G110-(01F1 (A+neG10t3=111F0(100R qu4_+0F=1#45 e.8A8=-010%+F@*0R000A I@89-@=-11#-000At1)1 Rnt7=R#011011-00 ege10-4_tt=111A)*G0G1@t00 r( n#71=11+00101 o+%n 4=%163% F*_nis+6+0=01)00_0%00 A-R_l# 1-22=110010)1t n-o1-24=1t1F10(FF000 #n%% A1%31=1t1011t0A*G1 r-is1(--2=01100G0F#01G u(87_=R1#(1G10+F0+10 (s 126=_1100A1(G01 _ultA38=0110+1001 rGi-ci2G4+=0G1)1@_0tR0100R esA134R=t1*1#1t01)10R) 10=00)100000 @digR7-8=11-01@1__1(1) R)nis90=1F%t101)1*1R1)% s@A@R*im+.5G2@=%0t41-@RAt )#@ Sed6(2=(110A01-01G)- lo10FA*A7=Gt0-11101+t0 borGt76))R=0#10+000#0 R)iA@s29=01R-100A_00t1A*+* s4t(9=F04*0 ap_1-3t6_=t#1-11G0*010 %_ien %F8(*2=1A10*1(1#t(10) @Aut 61+=_@t%1t10#1101 n4A#(3=0@(1#101)1t10(# iR)s14AF9=-1110+011 itF12-R##t0@G=0101@1_1+1 % Aven@117F=**-11t0001#*1 )Aen+t101=0#10-0000#-G a(@%t+i1FG35+=1100101 F#s a__58=110_01*%F01-- uc*to-#77t=%1101100 r1-14-=R111010G1# F.70F)=(111+0F111@F #V1)42=1G1+01tR110 es))t*116*R=*0(@RA1t0#t11%1#0 i-Rbu++l130+=)+t()01*0R1)1GG11 Fu#m#96=-+11%)10101( p73=1_1000R01 ort%#Ft1+A4+@F7)A=11%A0111*R1 Gito21=F0_111@0000t% r- (a6t9=0100-_0t00 liq19=(A(-162 Aua*@#m@ 31=0F1*100(00A1+F e(n%56=11R*00011 imG+ t+s+tte13-G9=G1101G)1R11 tm42)=0#t110_10A01 pGer G4)F8=0010F1111@ t*e+mp128=11011R1@G1 %us.%1(18=110%11R)1%@1+@( FEt5)=00#R1)0)0000 iam@65-=_F1t10_@0001A v%G_+*i+72-=t_010_0000- )ta54=@)10100+11 *)eR6R7=110%01AG01 * 25@=01)1_@+*10101# a1=1A_(64 uctor @10A-8=01@-)0111*1 e(r9@9=11100*)-%(*10 #o_s_1%-41F=11R%0(1@001 ._+ PFR@55#G=1100101 raG6=1G41 es_2F**0=0R11@0G)01F01 Rent%2_2=0#11-10RF*)01*R0t-_F F 23(=0110(@11)11 *ac aF%%@li3F4=-01)F1_10F101_ q#ueA%t# 1GA_At8=@001+G@*00000 aF_ugue45(@++)=040RR , quisR a9*=157F% t_A(#G%#c-cu@+ms_(%a)n1Rt5=040 _ nib_h.) U132=11*G0110R0- t dGigniF44F=1G45 ss)Fi#m 14@(8R=#G1*1tt@0110(Gt0 a2=001A0000_*t0 RRAugue# Fv(te)10A6=1110_011+ sAtiG(129=11+0%0G011 bRulu_m temttp8=-163 or *93=111+0#-01_0*)+# #%f(ermeGtn1R0#A*A-5=#1R(1-@100R0_0 _t+_Fum-. 
*-MRG1(6RF)=011+t(#10100G aece(%@nG-a+As tApo(rt5F0t)=+0RF(4F1 ti#to@-r nAul_l)84*=01000A)%0-0 a Rno14*3#=1100111 n% (F-heGnGd@(reF(ri#*#t1)G46)=@1A10__1*G@111GAA e*leGifend.37=160 @ Cutr8t6+=-1+101(111-_ aAbitur ta_#)uct%o(r -p_h@aretRra c*9)5=@)G1)10#_001t0_# Fonse%t)_ct%(etu-r. -------------------------------------------------------------------------------- /resources/setup_project/solution/configurables/split_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "preprocessed_data_path = '../data/intermediate/preprocessed_data.json'\n", 10 | "train_dataset_path = '../data/intermediate/train_dataset.txt'\n", 11 | "test_dataset_path = '../data/intermediate/test_dataset.txt'\n", 12 | "test_percent = 0.15" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "import json\n", 22 | "with open(preprocessed_data_path, 'r') as fd:\n", 23 | " preprocessed_data = json.load(fd)\n", 24 | "preprocessed_data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "len(preprocessed_data)" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "from classifier.split import split_dataset\n", 43 | "\n", 44 | "\n", 45 | "test_dataset, train_dataset = split_dataset(preprocessed_data, test_percent)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "len(test_dataset), len(train_dataset)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "test_dataset" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from collections import Counter\n", 73 | "test_review_by_labels = Counter([d.split()[0] for d in test_dataset])\n", 74 | "train_review_by_labels = Counter([d.split()[0] for d in train_dataset])\n", 75 | "\n", 76 | "test_review_by_labels.most_common()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "train_review_by_labels.most_common()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "from classifier.helper import write_lines_file\n", 95 | "\n", 96 | "write_lines_file(train_dataset_path, train_dataset)\n", 97 | "write_lines_file(test_dataset_path, test_dataset)\n" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.6.7" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- 
/resources/setup_project/project/notebooks/split_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "preprocessed_data_path = '../data/intermediate/preprocessed_data.json'" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import json\n", 19 | "with open(preprocessed_data_path, 'r') as fd:\n", 20 | " preprocessed_data = json.load(fd)\n", 21 | "preprocessed_data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "len(preprocessed_data)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from classifier.split import split_dataset\n", 40 | "\n", 41 | "\n", 42 | "test_dataset, train_dataset = split_dataset(preprocessed_data, test_percent=0.15)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "len(test_dataset), len(train_dataset)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "test_dataset" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "from collections import Counter\n", 70 | "test_review_by_labels = Counter([d.split()[0] for d in test_dataset])\n", 71 | "train_review_by_labels = Counter([d.split()[0] for d in train_dataset])\n", 72 | "\n", 73 | "test_review_by_labels.most_common()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "train_review_by_labels.most_common()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from classifier.helper import write_lines_file\n", 92 | "\n", 93 | "write_lines_file('../data/intermediate/train_dataset.txt', train_dataset)\n", 94 | "write_lines_file('../data/intermediate/test_dataset.txt', test_dataset)\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Python 3", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.6.5" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } 127 | -------------------------------------------------------------------------------- /resources/04_Evaluate_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Evaluate the model\n", 8 | "Next, we want to evaluate how well the model is doing, on train and test data. 
" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Parameters\n", 18 | "\"\"\"\n", 19 | ":param str model_file: Path to model file\n", 20 | ":param str data_file: Path to data files\n", 21 | ":param str result_file: Path to file for storing evaluation metrics\n", 22 | ":dvc-in data_file: ./poc/data/data_train_tokenized.csv \n", 23 | ":dvc-in model_file: ./poc/data/fasttext_model.bin \n", 24 | ":dvc-out result_file: ./poc/data/metrics.txt\n", 25 | "\"\"\"\n", 26 | "# Value of parameters for this Jupyter Notebook only\n", 27 | "# the notebook is in ./poc/pipeline/notebooks\n", 28 | "model_file = '../../data/fasttext_model.bin'\n", 29 | "data_file = '../../data/data_train_tokenized.csv'\n", 30 | "result_file = '../../data/metrics.txt'" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "import pandas as pd\n", 40 | "import numpy as np\n", 41 | "from pyfasttext import FastText\n", 42 | "import json" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = pd.read_csv(data_file)\n", 52 | "df['data'] = df['data'].apply(lambda s: ' '.join(json.loads(s.replace(\"'\", '\"'))))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "model = FastText()\n", 62 | "model.load_model(model_file)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "predicted = pd.DataFrame(model.predict([sentence + '\\n' for sentence in df['data']]), columns=['targetnames'])" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "accuracy = ((predicted != df[['targetnames']]).sum() / len(df)).iloc[0]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "with open(result_file, 'w') as file_desc:\n", 90 | " file_desc.write(f'accuracy {accuracy}\\n')" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.6.5" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- /resources/setup_project/solution/mlvtools/preprocess_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\"\"\"\n", 10 | ":param str extracted_data_path: Path extracted data input file\n", 11 | ":param str preprocessed_data_path: Path to the preprocessed data output file\n", 12 | "\n", 13 | ":dvc-in extracted_data_path: 
./data/intermediate/extracted_data.json\n", 14 | ":dvc-out preprocessed_data_path: ./data/intermediate/preprocessed_data.json\n", 15 | "\"\"\"\n", 16 | "# The following code in this cell will not be added to the generated Python script\n", 17 | "# These values are only for notebook purposes\n", 18 | "extracted_data_path = '../data/intermediate/extracted_data.json'\n", 19 | "preprocessed_data_path = '../data/intermediate/preprocessed_data.json'" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import json\n", 29 | "with open(extracted_data_path) as fd:\n", 30 | " extracted_data = json.load(fd)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# No effect\n", 40 | "extracted_data" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# No effect\n", 50 | "from collections import Counter\n", 51 | "nb_review_by_labels = Counter([d[0] for d in extracted_data])\n", 52 | "\n", 53 | "nb_review_by_labels.most_common()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "from classifier.pre_process import preprocess_data\n", 63 | "\n", 64 | "preprocessed_data = preprocess_data(extracted_data)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# No effect\n", 74 | "preprocessed_data" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# No effect\n", 84 | "from collections import Counter\n", 85 | "nb_review_by_labels = Counter([d.split()[0] for d in preprocessed_data])\n", 86 | "\n", 87 | "nb_review_by_labels.most_common()" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from classifier.helper import write_json\n", 97 | "write_json(preprocessed_data_path, preprocessed_data)" 98 | ] 99 | } 100 | ], 101 | "metadata": { 102 | "kernelspec": { 103 | "display_name": "Python 3", 104 | "language": "python", 105 | "name": "python3" 106 | }, 107 | "language_info": { 108 | "codemirror_mode": { 109 | "name": "ipython", 110 | "version": 3 111 | }, 112 | "file_extension": ".py", 113 | "mimetype": "text/x-python", 114 | "name": "python", 115 | "nbconvert_exporter": "python", 116 | "pygments_lexer": "ipython3", 117 | "version": "3.6.7" 118 | } 119 | }, 120 | "nbformat": 4, 121 | "nbformat_minor": 2 122 | } 123 | -------------------------------------------------------------------------------- /resources/dummy/step3_convert_binaries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dummy pipeline - step 3: convert binary ASCII code to character" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This step converts an ASCII binary value to the corresponding character." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Parameters\n", 24 | "\"\"\"\n", 25 | ":param str binary_data: path to binary data input file\n", 26 | ":param str char_from_bin: path to converted data from binary output file\n", 27 | ":dvc-cmd: dvc run -f $MLV_DVC_META_FILENAME -d ./dummy/data/binary_data.txt \n", 28 | " -o ./dummy/data/data_conv_from_bin.txt\n", 29 | " $MLV_PY_CMD_PATH --binary-data ./dummy/data/binary_data.txt\n", 30 | " --char-from-bin ./dummy/data/data_conv_from_bin.txt\n", 31 | "\"\"\"\n", 32 | "# Value of parameters for this Jupyter Notebook only\n", 33 | "# the notebook is in ./dummy/pipeline/notebooks\n", 34 | "binary_data = '../../data/binary_data.txt'\n", 35 | "char_from_bin = '../../data/data_conv_from_bin.txt'" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "> In this case we use **dvc-cmd** instead of **dvc-in** and **dvc-out**, but only to show that it is possible if needed. However, it is not recommended, due to its verbosity and the risk of errors\n", 43 | "\n", 44 | "With **dvc-in** and **dvc-out**:\n", 45 | "\n", 46 | " \"\"\" \n", 47 | " :param str binary_data: path to binary data input file\n", 48 | " :param str char_from_bin: path to converted data from binary output file\n", 49 | " :dvc-in binary_data: ./dummy/data/binary_data.txt\n", 50 | " :dvc-out char_from_bin: ./dummy/data/data_conv_from_bin.txt\n", 51 | " \"\"\"" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "with open(binary_data, 'r') as fd:\n", 61 | " data = fd.read()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "characters = [f\"{d.split('=')[0]}={chr(int(d.split('=')[1], 2))}\" for d in data.split()]" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "with open(char_from_bin, 'w') as fd:\n", 80 | " fd.write(' '.join(characters))" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "# No effect\n", 90 | "print(characters)" 91 | ] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.5" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /resources/dummy/step2_split_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dummy pipeline - step 2: split data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "This step splits the data input file into two files: one with octal values, the other with binary values." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Parameters\n", 24 | "\"\"\"\n", 25 | ":param str sanitized_data: path to input sanitized data\n", 26 | ":param str octal_data: path to octal data output file\n", 27 | ":param str binary_data: path to binary data output file\n", 28 | ":param int size_bin_data: number of bits in a binary value\n", 29 | ":dvc-in sanitized_data: ./dummy/data/sanitized_data.txt\n", 30 | ":dvc-out octal_data: ./dummy/data/octal_data.txt\n", 31 | ":dvc-out binary_data: ./dummy/data/binary_data.txt\n", 32 | ":dvc-extra: --size-bin-data 8\n", 33 | "\"\"\"\n", 34 | "# Value of parameters for this Jupyter Notebook only\n", 35 | "# the notebook is in ./dummy/pipeline/notebooks\n", 36 | "sanitized_data = '../../data/sanitized_data.txt'\n", 37 | "octal_data = '../../data/octal_data.txt'\n", 38 | "binary_data = '../../data/binary_data.txt'\n", 39 | "size_bin_data = 8" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "> In this case we use **dvc-extra** to provide a parameter which is neither an input nor an output (--size-bin-data). " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "with open(sanitized_data, 'r') as fd:\n", 56 | " data = fd.read()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "binaries = [d for d in data.split() if len(d.split('=')[1]) >= size_bin_data]" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "octals = [d for d in data.split() if len(d.split('=')[1]) == 3]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "with open(octal_data, 'w') as fd:\n", 84 | " fd.write(' '.join(octals))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "with open(binary_data, 'w') as fd:\n", 94 | " fd.write(' '.join(binaries))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# No effect\n", 104 | "print(binaries)\n", 105 | "print(octals)" 106 | ] 107 | } 108 | ], 109 | "metadata": { 110 | "kernelspec": { 111 | "display_name": "Python 3", 112 | "language": "python", 113 | "name": "python3" 114 | }, 115 | "language_info": { 116 | "codemirror_mode": { 117 | "name": "ipython", 118 | "version": 3 119 | }, 120 | "file_extension": ".py", 121 | "mimetype": "text/x-python", 122 | "name": "python", 123 | "nbconvert_exporter": "python", 124 | "pygments_lexer": "ipython3", 125 | "version": "3.6.5" 126 | } 127 | }, 128 | "nbformat": 4, 129 | "nbformat_minor": 2 130 | } 131 | -------------------------------------------------------------------------------- /resources/03_Classify_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Classify text\n", 8 | "We are going to train a classifier on the tokenized text input, using the [FastText library](https://fasttext.cc/). 
\n", 9 | "\n", 10 | "In addition to the input data file, we give the command a few hyperparameter values, and we store the binary file representing the learned model as output.\n", 11 | "\n", 12 | "We only learn for a few epochs, to see how the versioning tools work. \n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# Parameters\n", 22 | "\"\"\"\n", 23 | ":param str input_csv_file: Path to input file\n", 24 | ":param str out_model_path: Path to model files\n", 25 | ":param float learning_rate: Learning rate\n", 26 | ":param int epochs: Number of epochs\n", 27 | "\n", 28 | ":dvc-in input_csv_file: ./poc/data/data_train_tokenized.csv\n", 29 | ":dvc-out out_model_path: ./poc/data/fasttext_model.bin\n", 30 | ":dvc-out: ./poc/data/fasttext_model.vec\n", 31 | ":dvc-extra: --learning-rate 0.7 --epochs 20\n", 32 | "\"\"\"\n", 33 | "# Value of parameters for this Jupyter Notebook only\n", 34 | "# the notebook is in ./poc/pipeline/notebooks\n", 35 | "input_csv_file = \"../../data/data_train_tokenized.csv\"\n", 36 | "out_model_path = '../../data/fasttext_model'\n", 37 | "learning_rate = .7\n", 38 | "epochs = 20" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "import pandas as pd\n", 48 | "import numpy as np\n", 49 | "from collections import Counter\n", 50 | "from pyfasttext import FastText\n", 51 | "import tempfile\n", 52 | "import os" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df = pd.read_csv(input_csv_file)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import json\n", 71 | "df['data'] = df['data'].apply(lambda s: json.loads(s.replace(\"'\", '\"')))" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "with tempfile.TemporaryDirectory() as tmp_dir:\n", 81 | " tmp_path = os.path.join(tmp_dir, 'unigrams')\n", 82 | " with open(tmp_path, 'w') as f:\n", 83 | " for text, _, lab in df.itertuples(index=False, name=None):\n", 84 | " f.write('__label__{} {}\\n'.format(lab, ' '.join(text)))\n", 85 | " \n", 86 | " model = FastText()\n", 87 | " # FastText automatically adds .bin at the end of the output model file name\n", 88 | " out_model_path = out_model_path.replace('.bin', '')\n", 89 | " model.supervised(input=tmp_path, output=out_model_path, epoch=epochs, lr=learning_rate)" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.6.5" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 2 114 | } 115 | -------------------------------------------------------------------------------- /resources/setup_project/solution/mlvtools/train_data_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": 
[ 9 | "\"\"\"\n", 10 | ":param str train_dataset_path: Path to the train data input file\n", 11 | ":param str conf_path: Path to the hyperparameters configuration input file\n", 12 | ":param str model_path: Path to the model output file\n", 13 | "\n", 14 | ":dvc-in train_dataset_path: ./data/intermediate/train_dataset.txt\n", 15 | ":dvc-in conf_path: ./data/input/conf.json\n", 16 | ":dvc-out model_path: ./data/model/classifier.bin\n", 17 | "\"\"\"\n", 18 | "# The following code in this cell will not be added to the generated Python script\n", 19 | "# These values are only for notebook purposes\n", 20 | "train_dataset_path = '../data/intermediate/train_dataset.txt'\n", 21 | "conf_path = '../data/input/conf.json'\n", 22 | "model_path = '../data/model/classifier.bin'" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# No effect\n", 32 | "with open(train_dataset_path, 'r') as fd:\n", 33 | " train_data_lines = fd.readlines()\n", 34 | "train_data_lines" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "import json\n", 44 | "with open(conf_path, 'r') as fd:\n", 45 | " conf = json.load(fd)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# No effect\n", 55 | "conf" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "import fasttext as ft\n", 65 | "from tempfile import TemporaryDirectory\n", 66 | "import shutil\n", 67 | "from os import remove, makedirs\n", 68 | "from os.path import join, exists, dirname\n", 69 | "def train(fasttext_data_path: str, fasttext_model_path: str, epochs: int, learning_rate: float):\n", 70 | " with TemporaryDirectory() as tmp_dir:\n", 71 | " # FastText automatically adds .bin at the end of the output model file name so\n", 72 | " # we use a temporary file to keep control of the output file path\n", 73 | " model_tmp_path = join(tmp_dir, 'model')\n", 74 | " ft.supervised(fasttext_data_path, model_tmp_path, lr=learning_rate, epoch=epochs, silent=0)\n", 75 | " if exists(fasttext_model_path):\n", 76 | " remove(fasttext_model_path)\n", 77 | " makedirs(dirname(fasttext_model_path), exist_ok=True)\n", 78 | " shutil.copy(f'{model_tmp_path}.bin', fasttext_model_path)\n", 79 | "\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "train(train_dataset_path, model_path, \n", 89 | " epochs=conf['epoch'], learning_rate=conf['learning_rate'])" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.6.7" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 2 114 | } 115 | -------------------------------------------------------------------------------- /talks/reveal.js/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | reveal.js 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 23 | 24 | 25 |
26 |
27 |
28 |

From ML experiments to production: versioning and reproducibility with MLV-tools

29 |

Stéphanie Bracaloni and Sarah Diot-Girard

30 | 33 |
34 |
35 |

About us

36 |
    37 |
  • Sarah Diot-Girard :
  • 38 |
  • Stéphanie Bracaloni :
  • 39 |
40 | 48 |
49 |
50 |

Why are we here

51 |

POC vs PROD

52 |

... vs Data scientist

53 |
54 |
55 |

The POC

56 | 57 | 62 |
63 |
64 |

The POC

65 | 66 | 70 |
71 |
72 |

The POC

73 | 74 | 77 |
78 |
79 |

The POC

80 | 81 | 84 |
85 |
86 |
87 | 88 | 89 | 90 | 91 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /resources/03_bis_Classify_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Classify text with trigrams\n", 8 | "We are going to train a classifier on the tokenized text input, using the [FastText library](https://fasttext.cc/). \n", 9 | "\n", 10 | "In addition to the input data file, we give the command a few hyperparameter values, and we store the binary file representing the learned model as output.\n", 11 | "\n", 12 | "We only learn for a few epochs, to see how the versioning tools work. \n", 13 | "\n", 14 | "We feed the neural network trigrams to see how the accuracy improves over using unigrams (single words).\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Parameters\n", 24 | "\"\"\"\n", 25 | ":param str input_csv_file: Path to input file\n", 26 | ":param str out_model_path: Path to model files\n", 27 | ":param float learning_rate: Learning rate\n", 28 | ":param int epochs: Number of epochs\n", 29 | "\n", 30 | ":dvc-in input_csv_file: [REPLACE_CSV_INPUT]\n", 31 | ":dvc-out out_model_path: [REPLACE_MODEL_OUT_BIN_PATH]\n", 32 | ":dvc-out: [REPLACE_MODEL_OUT_VEC_PATH]\n", 33 | ":dvc-extra: --learning-rate 0.7 --epochs 20\n", 34 | "\"\"\"\n", 35 | "# Value of parameters for this Jupyter Notebook only\n", 36 | "# the notebook is in ./poc/pipeline/notebooks\n", 37 | "input_csv_file = \"../../data/data_train_tokenized.csv\"\n", 38 | "out_model_path = '../../data/fasttext_model_bis'\n", 39 | "learning_rate = .7\n", 40 | "epochs = 20" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "import numpy as np\n", 51 | "from pyfasttext import FastText\n", 52 | "import tempfile\n", 53 | "import os" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "df = pd.read_csv(input_csv_file)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import json\n", 72 | "df['data'] = df['data'].apply(lambda s: json.loads(s.replace(\"'\", '\"')))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "with tempfile.TemporaryDirectory() as tmp_dir:\n", 82 | " tmp_path = os.path.join(tmp_dir, 'trigrams')\n", 83 | " with open(tmp_path, 'w') as f:\n", 84 | " for text, _, lab in df.itertuples(index=False, name=None):\n", 85 | " f.write('__label__{} {}\\n'.format(lab, ' '.join(text)))\n", 86 | " \n", 87 | " model = FastText()\n", 88 | " # FastText automatically adds .bin at the end of the output model file name\n", 89 | " out_model_path = out_model_path.replace('.bin', '')\n", 90 | " model.supervised(input=tmp_path, output=out_model_path, epoch=epochs, lr=learning_rate, wordNgrams=3)" 91 | ] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | 
"mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.5" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /tutorial/use_case4.md: -------------------------------------------------------------------------------- 1 | # Use Case 4: Combine Metrics 2 | 3 | One typical use-case in Machine Learning is that of hyper-parameters optimization. We want to train a classifier with various choices 4 | of hyperparameters, using cross-validation to get an accurate estimate of generalisation metrics (accuracy on validation set, but 5 | also possibly F1-scores or other metrics depending on your data). 6 | 7 | Each of those runs will thus generate a set of metrics, and we want to have a unified view on all results to make a decision on 8 | the best set of hyperparameters. We use **MLFlow tracking API** to record and expose results. 9 | 10 | **Requirements**: 11 | 12 | - setup the environment ([tutorial setup](./setup.md)) 13 | - build the pipeline from [Use Case 1: Build and Reproduce a Pipeline](./use_case1.md) 14 | 15 | > Note: it is possible to quickly build the pipeline from Use Case 1 running `make setup` if setup is not done 16 | then `make pipeline1`. Be careful, the pipeline files and DVC meta files will not be committed. 17 | 18 | We want to reuse the split step from the **Use Case 1** pipeline, and then run cross validation to tune hyperparameters. 19 | 20 | `20news-bydate_py3.pkz` = Split => `data_train.csv` = Classif with Cross Validation => ./poc/data/cross_valid_metrics 21 | 22 | ## 1. Create a Cross Validation Step 23 | 24 | 25 | This pipeline step is based on the `05_Tune_hyperparameters_with_crossvalidation.ipynb` **Jupyter Notebook**. 26 | 27 | We use scikit-learn to build a simple pipeline with two hyperparameters: the number of words in the vocabulary for 28 | the bag-of-words encoding, and the regularization parameter for the Logistic Regression classifier. 29 | 30 | For tutorial purpose, we try out a very limited number of values (a more realistic scenario would probably involve 31 | a grid search). In order for the step to execute quite quickly, we only use one repetition of 3-fold cross-validation. 32 | Once again, in real life, you'll probably want to use 10 repetitions of 5-fold or 10-fold cross-validation. 33 | 34 | In this notebook, the output is just the folder containing all metrics results, but you might also want to store 35 | the model trained with the best hyperparameters. That's a nice exercice for you to try ! 36 | 37 | 38 | 39 | ||| 40 | | :--- | :--- | 41 | | **Step Input**: | `./poc/data/data_train.csv` | 42 | ||| 43 | | **Step Outputs**: | `./poc/data/cross_valid_metrics` | 44 | ||| 45 | |**Generated files**:| `./poc/pipeline/steps/mlvtools_05_tune_hyperparameters_with_crossvalidation.py`| 46 | | | `./poc/commands/dvc/mlvtools_05_tune_hyperparameters_with_crossvalidation_dvc`| 47 | 48 | 1. Copy the `05_Tune_hyperparameters_with_crossvalidation.ipynb` from the resources directory to the poc project: 49 | 50 | cp ./resources/05_Tune_hyperparameters_with_crossvalidation.ipynb ./poc/pipeline/notebooks/ 51 | 52 | 2. 
2. Continue with the usual process 53 | 54 | 55 | # Git versioning 56 | git add ./poc/pipeline/notebooks/05_Tune_hyperparameters_with_crossvalidation.ipynb 57 | git commit -m 'Tutorial: use case 4 step 1 - Add notebook' 58 | 59 | # Convert to Python 3 script 60 | ipynb_to_python -w . -n ./poc/pipeline/notebooks/05_Tune_hyperparameters_with_crossvalidation.ipynb 61 | 62 | # Generate command 63 | gen_dvc -w . -i ./poc/pipeline/steps/mlvtools_05_tune_hyperparameters_with_crossvalidation.py 64 | 65 | # Run 66 | ./poc/commands/dvc/mlvtools_05_tune_hyperparameters_with_crossvalidation_dvc 67 | 68 | # Version the result 69 | git add *.dvc && git add ./poc/pipeline ./poc/commands/ ./poc/data/ 70 | git commit -m 'Tutorial use case 4 step 1: cross validation' 71 | 72 | 73 | 3. Analyse results 74 | 75 | All metrics are logged in **MLflow tracking**, so it is possible to visualize them. 76 | 77 | Run: `mlflow ui --file-store ./poc/data/cross_valid_metrics/` 78 | 79 | Go to: [http://127.0.0.1:5000](http://127.0.0.1:5000) 80 | 81 | 82 | You reached the end of this tutorial. 83 | 84 | Or [go back to README](../README.md) 85 | -------------------------------------------------------------------------------- /tutorial/setup.md: -------------------------------------------------------------------------------- 1 | # Tutorial Setup 2 | 3 | This is the setup section for the realistic tutorial. 4 | 5 | ## 1. Create Project Structure 6 | 7 | All resource files needed in this tutorial are provided in `ml-poc-version/resources`. 8 | The structure of the project will be created as the tutorial progresses. 9 | 10 | If you have not already done so, clone the repository on the tutorial branch. 11 | 12 | git clone -b tutorial https://github.com/peopledoc/mlv-tools-tutorial 13 | cd ml-poc-version 14 | 15 | Create your working branch: 16 | 17 | git checkout -b working 18 | 19 | 20 | Create the project base structure. 21 | 22 | make init-struct 23 | 24 | The following structure must be created: 25 | 26 | ├── poc 27 | │   ├── pipeline 28 | │   │   ├── __init__.py 29 | │   │   ├── notebooks # contains Jupyter notebooks (one per pipeline step) 30 | │   │   └── steps # contains generated configurable Python 3 scripts 31 | │   ├── data # contains pipeline data 32 | │   └── commands 33 | │   └── dvc # contains DVC commands wrapped in bash scripts 34 | ... 35 | ├── resources # contains Jupyter notebooks needed in this tutorial 36 | │   ├── 01_Extract_dataset.ipynb 37 | │   ├── 02_Tokenize_text.ipynb 38 | │   ├── 03_bis_Classify_text.ipynb 39 | │   ├── 03_Classify_text.ipynb 40 | │   └── 04_Evaluate_model.ipynb 41 | ... 42 | 43 | > It is not mandatory to follow this structure, it is just an example for this tutorial. 44 | 45 | ## 2. Prepare Environment 46 | 47 | Create a virtual environment using **conda** or **virtualenv**, then activate it. 48 | Then set up the project. 49 | 50 | make develop 51 | 52 | ## 3. Initialize DVC Project 53 | **DVC** works on top of **git** repositories. Run **DVC** initialization in a **git** 54 | repository directory to create **DVC meta files**. 55 | 56 | dvc init 57 | 58 | The directory `.dvc` should be created in the project root directory. 59 | 60 | Add it under git versioning: 61 | 62 | git commit -m 'Tutorial setup: dvc init' ./.dvc/ 63 | 64 | ## 4. Create MLV-tools Project Configuration 65 | 66 | Using **MLV-tools**, it can be repetitive to provide output path parameters for each `ipynb_to_python` 67 | and `gen_dvc` command.
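To make the repetition concrete, here is a hedged illustration of what the explicit form involves. The output flag names in this sketch are assumptions for illustration only (check `ipynb_to_python --help` and `gen_dvc --help` for the real interface), but the shape of the problem is accurate:

    # Hypothetical explicit-output form -- flag names are assumptions, not the documented CLI
    ipynb_to_python -n ./poc/pipeline/notebooks/02_Tokenize_text.ipynb \
        -o ./poc/pipeline/steps/mlvtools_02_tokenize_text.py
    gen_dvc -i ./poc/pipeline/steps/mlvtools_02_tokenize_text.py \
        -o ./poc/commands/dvc/mlvtools_02_tokenize_text_dvc
    # ... and the same path boilerplate again for every other notebook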
68 | 69 | It is possible to provide a configuration to declare the project structure and 70 | let **MLV-tools** generate output paths. 71 | (For more information see the [documentation](https://github.com/mlflow/mlflow)) 72 | 73 | make mlvtools-conf 74 | 75 | The configuration file `./.mlvtools` should be created. 76 | 77 | Add it under git versioning: 78 | 79 | git add .mlvtools && git commit -m 'Tutorial setup: MLV-tools conf' 80 | 81 | ## 5. Add Git Hooks and Filters 82 | 83 | ### 5.1 Automatise Jupyter Notebook Cleanup 84 | 85 | It is usually not useful to version the outputs embedded in **Jupyter notebooks**. Sometimes it is even forbidden, 86 | for example if you work on production data. To avoid mistakes, use a git pre-commit hook or a git filter to clean up 87 | **Jupyter notebook** outputs. Several tools can do that, 88 | see for example [nbstripout](https://github.com/kynan/nbstripout). 89 | 90 | pip install --upgrade nbstripout 91 | nbstripout --install 92 | 93 | With the **nbstripout** git filter, **Jupyter notebook** outputs are cleaned on check-in, on every branch. That means 94 | that when you commit a change, you keep the outputs in your local notebook and can continue working, 95 | but those outputs are not sent to the remote server when you push. 96 | Notebook outputs are also excluded from the git diff. 97 | 98 | ## 6. Get Tutorial Data 99 | 100 | This tutorial is based on data from [20_newsgroup](http://scikit-learn.org/stable/datasets/). 101 | Run the following command to download them. 102 | 103 | make download-data 104 | 105 | Data are stored in `./poc/data/20news-bydate_py3.pkz`. 106 | 107 | 108 | You reached the end of the setup part, see [Use Case 1: Build and Reproduce a Pipeline](./use_case1.md) 109 | 110 | Or [go back to README](../README.md) 111 | -------------------------------------------------------------------------------- /resources/02_Tokenize_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tokenize text\n", 8 | "The next step in the pipeline is to tokenize the text input, as is usual in Natural Language Processing. In order to do that, we use the wordpunct tokenizer provided by NLTK. \n", 9 | "\n", 10 | "We also remove English stopwords (frequent words which add no semantic meaning, such as \"and\", \"is\", \"the\"...). \n", 11 | "\n", 12 | "Each token is also converted to lower-case and non-alphabetic tokens are removed. \n", 13 | "\n", 14 | "In this very simple tutorial example, we do not apply any lemmatization technique." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Parameters\n", 24 | "\"\"\"\n", 25 | ":param str input_csv_file: Path to input file\n", 26 | ":param str output_csv_file: Path to output file\n", 27 | ":dvc-in input_csv_file: ./poc/data/data_train.csv\n", 28 | ":dvc-out output_csv_file: ./poc/data/data_train_tokenized.csv\n", 29 | "\"\"\"\n", 30 | "# Value of parameters for this Jupyter Notebook only\n", 31 | "# the notebook is in ./poc/pipeline/notebooks\n", 32 | "input_csv_file = \"../../data/data_train.csv\"\n", 33 | "output_csv_file = input_csv_file.replace('.csv', '_tokenized.csv')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from nltk.tokenize import wordpunct_tokenize\n", 45 | "from nltk.corpus import stopwords" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "df = pd.read_csv(input_csv_file)\n", 55 | "df.head()" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "stopswords_english = set(stopwords.words('english'))" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "def tokenize_and_clean_text(s):\n", 74 | " return [token.lower() for token in wordpunct_tokenize(s) if token.isalpha() and token.lower() not in stopswords_english]" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "df = df.dropna()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "df['data'] = df['data'].apply(tokenize_and_clean_text)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# No effect\n", 102 | "df.head()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "df.to_csv(output_csv_file, index=False)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.6.5" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 2 143 | } 144 | -------------------------------------------------------------------------------- /talks/reveal.js/css/print/pdf.css: -------------------------------------------------------------------------------- 1 | /** 2 | * This stylesheet is used to print reveal.js 3 | * presentations to PDF. 
4 | * 5 | * https://github.com/hakimel/reveal.js#pdf-export 6 | */ 7 | 8 | * { 9 | -webkit-print-color-adjust: exact; 10 | } 11 | 12 | body { 13 | margin: 0 auto !important; 14 | border: 0; 15 | padding: 0; 16 | float: none !important; 17 | overflow: visible; 18 | } 19 | 20 | html { 21 | width: 100%; 22 | height: 100%; 23 | overflow: visible; 24 | } 25 | 26 | /* Remove any elements not needed in print. */ 27 | .nestedarrow, 28 | .reveal .controls, 29 | .reveal .progress, 30 | .reveal .playback, 31 | .reveal.overview, 32 | .fork-reveal, 33 | .share-reveal, 34 | .state-background { 35 | display: none !important; 36 | } 37 | 38 | h1, h2, h3, h4, h5, h6 { 39 | text-shadow: 0 0 0 #000 !important; 40 | } 41 | 42 | .reveal pre code { 43 | overflow: hidden !important; 44 | font-family: Courier, 'Courier New', monospace !important; 45 | } 46 | 47 | ul, ol, div, p { 48 | visibility: visible; 49 | position: static; 50 | width: auto; 51 | height: auto; 52 | display: block; 53 | overflow: visible; 54 | margin: auto; 55 | } 56 | .reveal { 57 | width: auto !important; 58 | height: auto !important; 59 | overflow: hidden !important; 60 | } 61 | .reveal .slides { 62 | position: static; 63 | width: 100% !important; 64 | height: auto !important; 65 | zoom: 1 !important; 66 | 67 | left: auto; 68 | top: auto; 69 | margin: 0 !important; 70 | padding: 0 !important; 71 | 72 | overflow: visible; 73 | display: block; 74 | 75 | -webkit-perspective: none; 76 | -moz-perspective: none; 77 | -ms-perspective: none; 78 | perspective: none; 79 | 80 | -webkit-perspective-origin: 50% 50%; /* there isn't a none/auto value but 50-50 is the default */ 81 | -moz-perspective-origin: 50% 50%; 82 | -ms-perspective-origin: 50% 50%; 83 | perspective-origin: 50% 50%; 84 | } 85 | 86 | .reveal .slides .pdf-page { 87 | position: relative; 88 | overflow: hidden; 89 | z-index: 1; 90 | 91 | page-break-after: always; 92 | } 93 | 94 | .reveal .slides section { 95 | visibility: visible !important; 96 | display: block !important; 97 | position: absolute !important; 98 | 99 | margin: 0 !important; 100 | padding: 0 !important; 101 | box-sizing: border-box !important; 102 | min-height: 1px; 103 | 104 | opacity: 1 !important; 105 | 106 | -webkit-transform-style: flat !important; 107 | -moz-transform-style: flat !important; 108 | -ms-transform-style: flat !important; 109 | transform-style: flat !important; 110 | 111 | -webkit-transform: none !important; 112 | -moz-transform: none !important; 113 | -ms-transform: none !important; 114 | transform: none !important; 115 | } 116 | 117 | .reveal section.stack { 118 | position: relative !important; 119 | margin: 0 !important; 120 | padding: 0 !important; 121 | page-break-after: avoid !important; 122 | height: auto !important; 123 | min-height: auto !important; 124 | } 125 | 126 | .reveal img { 127 | box-shadow: none; 128 | } 129 | 130 | .reveal .roll { 131 | overflow: visible; 132 | line-height: 1em; 133 | } 134 | 135 | /* Slide backgrounds are placed inside of their slide when exporting to PDF */ 136 | .reveal .slide-background { 137 | display: block !important; 138 | position: absolute; 139 | top: 0; 140 | left: 0; 141 | width: 100%; 142 | height: 100%; 143 | z-index: auto !important; 144 | } 145 | 146 | /* Display slide speaker notes when 'showNotes' is enabled */ 147 | .reveal.show-notes { 148 | max-width: none; 149 | max-height: none; 150 | } 151 | .reveal .speaker-notes-pdf { 152 | display: block; 153 | width: 100%; 154 | height: auto; 155 | max-height: none; 156 | top: auto; 157 | right: auto; 158 | 
bottom: auto; 159 | left: auto; 160 | z-index: 100; 161 | } 162 | 163 | /* Layout option which makes notes appear on a separate page */ 164 | .reveal .speaker-notes-pdf[data-layout="separate-page"] { 165 | position: relative; 166 | color: inherit; 167 | background-color: transparent; 168 | padding: 20px; 169 | page-break-after: always; 170 | border: 0; 171 | } 172 | 173 | /* Display slide numbers when 'slideNumber' is enabled */ 174 | .reveal .slide-number-pdf { 175 | display: block; 176 | position: absolute; 177 | font-size: 14px; 178 | } 179 | -------------------------------------------------------------------------------- /resources/setup_project/solution/mlvtools/split_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "\"\"\"\n", 10 | ":param str preprocessed_data_path: Path to preprocessed data input file\n", 11 | ":param str train_dataset_path: Path to the train data output file\n", 12 | ":param str test_dataset_path: Path to the test data output file\n", 13 | ":param float test_percent: Percentage of test data (example: 0.15)\n", 14 | " \n", 15 | ":dvc-in preprocessed_data_path: ./data/intermediate/preprocessed_data.json\n", 16 | ":dvc-out train_dataset_path: ./data/intermediate/train_dataset.txt\n", 17 | ":dvc-out test_dataset_path: ./data/intermediate/test_dataset.txt\n", 18 | ":dvc-extra: --test-percent 0.15\n", 19 | "\"\"\"\n", 20 | "# The following code in this cell will not be added to the generated Python script\n", 21 | "# These values are only used when running the notebook\n", 22 | "preprocessed_data_path = '../data/intermediate/preprocessed_data.json'\n", 23 | "train_dataset_path = '../data/intermediate/train_dataset.txt'\n", 24 | "test_dataset_path = '../data/intermediate/test_dataset.txt'\n", 25 | "test_percent = 0.15" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import json\n", 35 | "with open(preprocessed_data_path, 'r') as fd:\n", 36 | "    preprocessed_data = json.load(fd)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "# No effect\n", 46 | "preprocessed_data" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# No effect\n", 56 | "len(preprocessed_data)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "from classifier.split import split_dataset\n", 66 | "\n", 67 | "\n", 68 | "test_dataset, train_dataset = split_dataset(preprocessed_data, test_percent)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# No effect\n", 78 | "len(test_dataset), len(train_dataset)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# No effect\n", 88 | "test_dataset" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# No effect\n", 98 | "from collections import Counter\n", 99 | "test_review_by_labels = Counter([d.split()[0] for d in test_dataset])\n", 100 | "train_review_by_labels = 
Counter([d.split()[0] for d in train_dataset])\n", 101 | "\n", 102 | "test_review_by_labels.most_common()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# No effect\n", 112 | "train_review_by_labels.most_common()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "from classifier.helper import write_lines_file\n", 122 | "\n", 123 | "write_lines_file(train_dataset_path, train_dataset)\n", 124 | "write_lines_file(test_dataset_path, test_dataset)\n" 125 | ] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.6.7" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /resources/dvc_playground/user/resources/inputs/part2.input: -------------------------------------------------------------------------------- 1 | §¨[[D¨rr![€![§$!dr£"?§!°XX!D./&@@%/,###(((%@%((((((((((((((((((((,[¨DD??.#@%%&@@@@#{*@,[£€$dX.@#,£(#?€"?r*@/£[{}Xd[D,%***.,*.*,***,*.{€.,*&&.!°¨£}XX€r"$"{"r}[¨[ 2 | (((((((((((#%@&#(&@@@@@@@@@@@@/(#,°,@(X?"[}£"!€%@("[§€§£$¨,#.¨[{¨$r§",(**,.,,,******.¨[.,*&@#!?r}dr°$dD?¨XDX$r€§?D}€!£r$D€?°X![XD}§!d.,/(%@@%/#&&#####,/(((#@&#( 3 | X£[$€€![}?"r.,,/(%&@@@&%(*,,(########,/((/(#&@@%#%%##%%%%&&@@@@@@&@@@@@&&&#%%###/*(@%r§X€€[€§X{D*@(X$d"€r$}?¨"°£{Dr!?°****.,*.***,**,[{,**&@@&}r§°X$rd[!Dd£r§{[? 4 | #%%##(,..,&%/@,.€rD§!D,..,,,,,,,,,.r"d?r}{{DX}}dX"%/£d{!D§£r{§£d¨¨?[X{$//**.,,,**,,**.?,*,&@@@dD}°£"X""£{d¨[€€¨$£§¨[?€?,#@@@&%#//***/((############(,*(((,.%@&%# 5 | !D?$$rrr£$.(@@@&%%##########(,*###,*((*/(,*@/.***,.?(&.rD€.&@@@%*}¨D€§D°r{"D{{X¨D€?D"?X§°°€°[?¨€¨°"XDd"¨"XX?§XX{°{r{¨d[{*&(*,*,,**.**.°,*,&@@@{}!€D$"X¨$![°!dD}r 6 | $/@&*§[°£{*@@@/€?{§¨¨}X$${"{¨°XXD§"?€!{!?{€$£![rD{¨€€X$$d?D°"d??$?!r?}"?X&@@&**,**,,*,},*,&%@@[£{§[°§!{d§"D{[[£d[[DDr$D€[!""{,%@@@&(,*///*,.*#((((((*.,(/*@/}{}d 7 | €XrXddXdd}£°°"?,/&@@@@&##((((//(#%%,§.(#/&%{€${#&&,{X$}¨[#@@&,d{¨dD€!"X$D!}"[d§"$rDd£?§€€?d¨?!d[§£{°["d€€£D°€{{rXX§XX§€.*@@@%*****,.**.,**&%@@d€X£D{"Dr}![£§r{?" 
8 | .££"}/@@@@@*X¨°°"€?$§r[$!§°¨ddX!$§£r°£°}${r£[€€[£}$X?D{°°$[£d{{[dDd¨#&(@&#@&,%#**/*,,*****#/&@[§!{"!€!°€¨{!r?D$ddX"¨"!DX!{§X§°$}Xr}.*(%&@@@&%##@&.[?,%@##@.d,&@* 9 | ?!}££dd?d?X"§{"£[€?€"drd{?!.*(@@@((#@&.*#%%@#.€X¨$D°*@@@@/?¨"[X°??r€DD"°¨D£€°r°[¨}"}?r¨{!r£€£dD!}{X$°€?€!°r°r}[d€§"r&@%/%@@**./&#/(*,***,(#/&@€d}£X![[}}d?X$D¨"D 10 | ¨€,@@@@@@/X¨[€rr}X{°?{"XX¨D°£€}X}X?"X{D[§¨D!€??{¨"dX"[[}d€{°"dD$}!?,@%/@&,./(/..*#(*%/**,%#/&@{{{€€€d£X?}{€°"€°r"°°!¨°D{X€?["$r€€D"¨"€§$d"£d.#@&/,%@@@@@&/"£XX$$ 11 | D"€XXdXr°?§[}D!?[!D§[?§${!§,&&@@@@@#/,}$Dr?Dd¨X°r#@@#@@@@%[£}£d""}°![?!$?X?°¨£""°€$?¨§?[§€{X}?d!££§°€}?§[}!?[d!{§"D/@/@#,}¨((/.".((*#&/*/&#/&@Dr¨d¨$°£?{D$?§dX![ 12 | @@/$.@@@@@*[""§€¨X"£d[{["?$d€$!¨¨DD"¨€!r§r°!}!D?£$?!$d£§€[!"£€¨°X?€%@@#*((##(#(..(*(&%@//@#(&&?r°£{}"d}¨£¨£$"{$°r"§¨{}}€}r°D"!D£r{{{£"D?¨.(@&&@@@*}?!"§°[D$}r".& 13 | r£d[§?{}XrDD}¨r£rX{X?°X.,#@@(*.[£¨}°[{€°!°{rd#@%/!D€.@@@@@@.{rrr$¨dD!°[°D"€r}?X}[°£"D!"€X[¨£"!?XDD}?¨{d![r§X€{{}D§.@@&£d,(((((/@&#*[/@&@@&(#@#D?¨¨€$X{{["§£}§}{€ 14 | $!dd.&@@@@&dD§}r€°§¨}X"D}§""[§?D?°d{r§d{r°§[£$rdrrD°€D[£rD£!{{"¨!d*@@/"?X/((((.X,(%*",@@@&/#@/D?}!rdr¨?{£"X!¨§d}{d{XdX€!D€{[DdDX$D{§*%@&#,"dD§}D§d}}!°[§€?}(@#}D 15 | D!}?!£{dX[}€¨X¨¨r.(@@(.£}§£§d°€¨£"!¨!"§¨§¨(%*Dd$D£[}[&@@@@&.°£!?€!¨$$££¨r}?¨Xd?§d€X"£$"°{r$![?Dd£[d!d[$X$€r°rr$dDX.@@,£!§,(((/.£.(%@/,#@@@%%@(!¨§€rD{}[¨¨§£"$$$! 16 | r£§[?(@@&&%!D{D€¨[!Dd{{?r{r{¨["d°$Xd£d}¨¨X"!X°DD[°"?${€[d!}!€D§rd€$&@(D!§!/(((.$.(*.&%//&@@#((}D§?°![£{{dD"¨§€X[¨°!{°[}r!¨[}°./&@%*§D[€}}[$r?r°rD°"}?XD{$Dd€€{D} 17 | £¨£D€??!}{°*&@/€§}€°}°DX{$}°$°dr€?£Xrrrd"d°§€}?[$$$€?,@@&&&}}€}dDX§r£"D!{}§drrdD§°£}$°[$?"{$!DDr¨[$°d?°°r?°£[XdX}$.&@@@*.X,((/.¨.(*?.(%#&@@#/(X""$}}{¨X?€¨{?"![d 18 | }dX¨"°#@@@&.$°DdD?°"£°r!d{{["XX}X¨?$}[{§}X€}"d[£XD!d£!dd¨°!D"[$[$X.&((@@@&@#((.d.(*§.(#@@&%%##£!€$[r{${d¨¨D€€X§?X§°{}€"X.#@#.}°!X"¨{??{d¨Dr§¨[}$}$!$"£D€!°DD[§{" 19 | [§°d}£*&@%.°D?[{°X§°dX$[X€{°§°$£°£[{?D§!°£?rD"dr£?"X°{¨%@@@.Xd£°D}"?[!{£X§°![X£$[°}[°"r¨D!d}$°D§€§£r}£°°!{{XDX¨}$r,@/X,%@@@%(/.¨.(/!,(,#@@/#((§$¨d°"!dDr{}}dD{£€ 20 | }{D°¨D€D.##.{§r[€"§€"d€§?!!d§}¨"D?Xdd§}XX{X!$°?§"€€$§{"£r€°€€£!DD{,@/"°![.#@#(.!.((.,(.!.&%,((X?!£$§dd${}€!¨$!§[¨°d£(@#.!€¨§?D{?§$dd£?!£}¨?¨§d£°$X?"X?¨°?°£¨{£$¨ 21 | X,#@#{!€![.#&*D°€"€¨rd§°§€X""°D}€Dr$§{X¨?"$¨"!}r£€"D{¨"[r°}€[D°°¨${!?$°dr!"€[£D¨$r[?$"°}["§X§!![§?XX}D°"¨{"r£"£{Xr,@/§[X€{?/@&%€}/(.,(."d.&&#({§€£d{[}"["d¨}}"!{ 22 | $d}§"!§€r!¨[£}"¨![§DD$€}€${°§d$}!"}d[r€!£?D€¨$£X{{§dd$¨{!d}d$"X§}X/@,?£?.$d"(@&*[*(*,(.?£r,&#(r[}{£€{XD$£${${!?*#@*£¨}}d[,@@@,°??"[!d§$°}[$[D?r{§XDd£¨€¨[$€$Xr$° 23 | &.Dr€£!}¨,@@&?d$?$§[}{"€X££{d{°!X"X}§§¨¨DDrX¨{}€¨{£r§D!X¨£§°!$[X¨£?D§[€¨X?€[€$!£"§?€°£?r§Dr§§}D£[{?¨r$$"!d£}§£!"XD.,/(//*¨"[.&@%*,(/,(../@@@#([€!D}!??"[€¨$¨X€/@ 24 | }d£€D{"{¨¨$°Dd£D?r"{d{°$d}$"!X¨£X°[dXX?!d°$d{}${$![!$?}¨XDd.*(%@@%*€XXd$?¨}D°*@@%.((*#&@@*(@%(d££"§$?}dDX£!}?,@/.!¨${$rdr(@@,r§}}¨°!°°{[X[r[$§!¨¨?D{rd§"§}§€X"£" 25 | XXD{§d}£°&@%°§$€$D$r£r{€X$€$}D[r}$§§°dr!["$¨r§[""§d,(&@&&%,X!d{"€¨"r$}!°D?"{r!£rr$"£r€}[$?£{"¨["Dr}£,(%@@&#*,.°£¨"}[§§"!§°}"D¨&@#!/(#&@,¨.@&((€"[X!{dD[d§!}r{/@, 26 | &@%*.}}!{!¨r}r€X¨[$€$DD!X[Xd£€X$§!r"°$[?€¨$",/#&&&%(*.rX£d¨€{$€X¨$}{rd"£¨¨!€D?%@%*(#@&.}.@@%*(r"!¨"°r?X!"§}{[/@,X{?d"¨r.(@@(¨$"!°£XD[}¨°!€r!§"D°X}§°¨¨r[{?{}"./% 27 | .¨§X"Xd/&@@*drX[{?dX[[{?r?"!£€?°D}°€€?[?}€,(&@%*.D[€$"d$§"rd{[d$$¨¨!?{£d[§£"£¨rX}X§€{.*(##%(*,.D"£§[{D!£!€d°£X£"X£°!}§!!{r$D$D%@%#&@&(.,#@@#*(¨¨d?D$}§°¨!?[{D,@( 28 | ?}°rD}§rr}§£§"?£$DX"{!DXX?d.*(%&@@@&#/,..£!{}¨°!rr"D$?[?D}£r¨Dr"dd}{?¨[XD?D$d,@@@&@%,[¨&@*@/*(?$DDD!£"{€$£"X!£*@&[[}€{D!..*.}[r$§DdD"¨££r$!°£$!!"r}$?,#@@&/.!{}d 29 | 
-------------------------------------------------------------------------------- /talks/reveal.js/Gruntfile.js: -------------------------------------------------------------------------------- 1 | /* global module:false */ 2 | module.exports = function(grunt) { 3 | var port = grunt.option('port') || 8000; 4 | var root = grunt.option('root') || '.'; 5 | 6 | if (!Array.isArray(root)) root = [root]; 7 | 8 | // Project configuration 9 | grunt.initConfig({ 10 | pkg: grunt.file.readJSON('package.json'), 11 | meta: { 12 | banner: 13 | '/*!\n' + 14 | ' * reveal.js <%= pkg.version %> (<%= grunt.template.today("yyyy-mm-dd, HH:MM") %>)\n' + 15 | ' * http://revealjs.com\n' + 16 | ' * MIT licensed\n' + 17 | ' *\n' + 18 | ' * Copyright (C) 2017 Hakim El Hattab, http://hakim.se\n' + 19 | ' */' 20 | }, 21 | 22 | qunit: { 23 | files: [ 'test/*.html' ] 24 | }, 25 | 26 | uglify: { 27 | options: { 28 | banner: '<%= meta.banner %>\n', 29 | screwIE8: false 30 | }, 31 | build: { 32 | src: 'js/reveal.js', 33 | dest: 'js/reveal.min.js' 34 | } 35 | }, 36 | 37 | sass: { 38 | core: { 39 | src: 'css/reveal.scss', 40 | dest: 'css/reveal.css' 41 | }, 42 | themes: { 43 | expand: true, 44 | cwd: 'css/theme/source', 45 | src: ['*.sass', '*.scss'], 46 | dest: 'css/theme', 47 | ext: '.css' 48 | } 49 | }, 50 | 51 | autoprefixer: { 52 | core: { 53 | src: 'css/reveal.css' 54 | } 55 | }, 56 | 57 | cssmin: { 58 | options: { 59 | compatibility: 'ie9' 60 | }, 61 | compress: { 62 | src: 'css/reveal.css', 63 | dest: 'css/reveal.min.css' 64 | } 65 | }, 66 | 67 | jshint: { 68 | options: { 69 | curly: false, 70 | eqeqeq: true, 71 | immed: true, 72 | esnext: true, 73 | latedef: 'nofunc', 74 | newcap: true, 75 | noarg: true, 76 | sub: true, 77 | undef: true, 78 | eqnull: true, 79 | browser: true, 80 | expr: true, 81 | globals: { 82 | head: false, 83 | module: false, 84 | console: false, 85 | unescape: false, 86 | define: false, 87 | exports: false 88 | } 89 | }, 90 | files: [ 'Gruntfile.js', 'js/reveal.js' ] 91 | }, 92 | 93 | connect: { 94 | server: { 95 | options: { 96 | port: port, 97 | base: root, 98 | livereload: true, 99 | open: true, 100 | useAvailablePort: true 101 | } 102 | } 103 | }, 104 | 105 | zip: { 106 | bundle: { 107 | src: [ 108 | 'index.html', 109 | 'css/**', 110 | 'js/**', 111 | 'lib/**', 112 | 'images/**', 113 | 'plugin/**', 114 | '**.md' 115 | ], 116 | dest: 'reveal-js-presentation.zip' 117 | } 118 | }, 119 | 120 | watch: { 121 | js: { 122 | files: [ 'Gruntfile.js', 'js/reveal.js' ], 123 | tasks: 'js' 124 | }, 125 | theme: { 126 | files: [ 127 | 'css/theme/source/*.sass', 128 | 'css/theme/source/*.scss', 129 | 'css/theme/template/*.sass', 130 | 'css/theme/template/*.scss' 131 | ], 132 | tasks: 'css-themes' 133 | }, 134 | css: { 135 | files: [ 'css/reveal.scss' ], 136 | tasks: 'css-core' 137 | }, 138 | html: { 139 | files: root.map(path => path + '/*.html') 140 | }, 141 | markdown: { 142 | files: root.map(path => path + '/*.md') 143 | }, 144 | options: { 145 | livereload: true 146 | } 147 | }, 148 | 149 | retire: { 150 | js: [ 'js/reveal.js', 'lib/js/*.js', 'plugin/**/*.js' ], 151 | node: [ '.' 
] 152 | } 153 | 154 | }); 155 | 156 | // Dependencies 157 | grunt.loadNpmTasks( 'grunt-contrib-connect' ); 158 | grunt.loadNpmTasks( 'grunt-contrib-cssmin' ); 159 | grunt.loadNpmTasks( 'grunt-contrib-jshint' ); 160 | grunt.loadNpmTasks( 'grunt-contrib-qunit' ); 161 | grunt.loadNpmTasks( 'grunt-contrib-uglify' ); 162 | grunt.loadNpmTasks( 'grunt-contrib-watch' ); 163 | grunt.loadNpmTasks( 'grunt-autoprefixer' ); 164 | grunt.loadNpmTasks( 'grunt-retire' ); 165 | grunt.loadNpmTasks( 'grunt-sass' ); 166 | grunt.loadNpmTasks( 'grunt-zip' ); 167 | 168 | // Default task 169 | grunt.registerTask( 'default', [ 'css', 'js' ] ); 170 | 171 | // JS task 172 | grunt.registerTask( 'js', [ 'jshint', 'uglify', 'qunit' ] ); 173 | 174 | // Theme CSS 175 | grunt.registerTask( 'css-themes', [ 'sass:themes' ] ); 176 | 177 | // Core framework CSS 178 | grunt.registerTask( 'css-core', [ 'sass:core', 'autoprefixer', 'cssmin' ] ); 179 | 180 | // All CSS 181 | grunt.registerTask( 'css', [ 'sass', 'autoprefixer', 'cssmin' ] ); 182 | 183 | // Package presentation to archive 184 | grunt.registerTask( 'package', [ 'default', 'zip' ] ); 185 | 186 | // Serve presentation locally 187 | grunt.registerTask( 'serve', [ 'connect', 'watch' ] ); 188 | 189 | // Run tests 190 | grunt.registerTask( 'test', [ 'jshint', 'qunit' ] ); 191 | 192 | }; 193 | -------------------------------------------------------------------------------- /resources/dvc_playground/user/resources/inputs/part1.input: -------------------------------------------------------------------------------- 1 | %&.¨§D£}X{!?$€X{?°r}?{ddD¨"°X}£{[/&@%/.r{{€§![£$[d°rr!d!rd?[}€¨?£$?./%&@@%#/,.r{,%@%,°!"[£D£€$Xr}$°{?[€[§?"§€}"£¨[{{[{{°D}d€.&@@@&/(,}(@(,@,*/§¨[r§§$[?r¨£?}r!°* 2 | r¨§§€?Xdr§.,/#&@@@%/,€r{¨}[DD?!°¨£r,&@%,r°§"¨X"?D¨€€"!?[{£€£D¨D[r[$!€!X°D£€"/@@@#{r!}}&#.*@.*/£$X€€}°![rdX€$[°!°}/@/[X°£}"°°X[}§X£}€r§€{""!$.(@@%*[$D¨{"D}X!D}X§ 3 | [}/@/£$§€[r§¨¨}r°$€}€$¨X"}*&@#.X£°rd!rX"X{d?¨$°$§§"[D./%&@@%(*.D{"X¨XDDr{"X{"DX$¨X€$.#@@@(.$£§€X$£¨¨£d£$°[}¨€$d°r§"rD€"[§!XX*@@*!¨r[d.@/€*@.*(¨?°!${d?°?£!¨§!r$° 4 | #&@&%/.!r$X"r"£¨$[€°!€D°[$°X}¨X§"Dd}d}¨*#@@(.?¨"°[£r!D![X°$d¨[{X€"}X}XX?§?r/@(?!€!$$X,@/[*@..(¨X°${°}€!$£§§§dr!X°D$.&&.rr°r[?r["rDX!X£,%&&/r€"X°£§¨€$€[}$"dd}.,( 5 | °?"¨°/@(.[§X[{¨{rD$*&@#.$°§?"r§??[${dD"*(%@@&#*.[§[!$r?§$r"[?{d"°r"[!rrdr"?°¨[?D¨°{§¨§r§§£(@@/¨r}¨D{°?drr!"€d!€°dD°€d?!}d}"..}[?$?d?!.&(.*@.?($!X§X[XXd"[r{°$[§$ 6 | }X€?r°£€"£§¨€D}[°r?"D?°D£!$X?r[¨D{€£Dd£€r![%&.r¨}!¨{}r{$r££"?$}}}d§d$?dr"?D{r}{Xr"€X€"*@%/@.{/?€[dD{X[D!{{}€€°°d[r!D}£.#@%.{[§d,/@&/X}$¨{rDDX§{§D§,#&@%*,{$r£[!d 7 | $XdD§D[€.%@*€/&@%,?$€D€°€¨?$[X*#&@&(.${$rdD{!X}DX¨d$$"¨r€§Dd§$"{°°rrr$}d¨£$DX€d{$§??D?$"X°,@($!$!}?¨£}X$§"X{d}{{r!}}°D["r£?"§?X§"}[d§¨€(@@@.€*?r"¨}£°¨[[d°X$¨°$d 8 | ¨°{¨{X{Dd°?d§¨§X!r¨"£D€r§¨"D!rdd?D¨Xr§[¨§,@&."{°?$!°X}D€dd$§!X}[!{Dd"€${§£dX[X["!§"$§{?€.&@.D.}§d§°}{XX€Xr$€°[}°"Xr¨}D¨r¨!.%@@.€}}d€$"d.,(&@&#*.?§["°DDd$X!!dD°¨ 9 | °§£${XD!°§£€.%#@@@@@@@%(*.§""[Dr£¨!¨D§[¨D}£D}}r¨"$€D$["X£?Dr{°£?£?$D¨?$£{€X[$$Dr§rX¨£"dD.&%°$¨¨d[?£€£r¨°X??!§¨r!r{{"X"}§"$!§["[£[!X[€X€°°,@&.{°}°r£¨[d$XX}X[}$£} 10 | }}€€{dd€?!£¨£XXrrd§}$X¨}€°§€!![?Dd°{"¨[*%@,X!§€{{$°!€d€{!XX§{!$£r}D$"D}D[€§€"[}d!Dd?$X€°{£&@@,£§d$$€¨r£!}?X€}{}X}"D£D!§X°¨$"r£?¨{[$€}r[?{drd¨?r"dX?€€§${£[§"{d§X 11 | ![}drdX""£!!!{d"§r{¨$?§{{[XX£°{¨d!X{°"$¨§¨[?!{"[}£§Dd[X$$X}$°¨!X£¨"¨¨dD!§€}D°}$r°D["d€[%@(£!!$}!{"{d$§$[?"?€"$?[$"X[°{$D!DX"°{d£?§?¨?§[?"$(&&&}!£€$§£{?$d°XD$r$£ 12 | DrDrd£€¨!!$[$"[d!D¨!"[}*(@@@@@@@@#(,.°"?rd°X[!{X""d£?....%&.?Dr!X€¨!DX$£°[D$[$£{[?dDD!{"°d?d£"$°!£X$°?[?$[°}€£§[£"}£¨D?r§°§€{{X{€?Dr§°$D}!}°£¨"X}}¨D$°£d§¨§}?"?$ 13 | 
€}£°D"r€[¨°!!X$°¨!d?"d}d[°D$§$r"¨§{£"!¨§}€?XX°"§?§D?"¨d¨{{r£§}r§°££.*(&@@@@@@@@@@@@@@%*!§X"}$[£[$r[,%@@@@%@#°€£"$[?££!¨d{X!°r°°°?}r[°°X€¨£d"€{$!"§[{"€§?¨d?€rr°! 14 | D€€?£§"[d?§{r§r".#@@&%(*,/(#####(##%&@@@@#.Xr?¨§r,&&,{£[X(&@*€¨}{r"Xr€}§€§$D"r{D"£¨£dXr€?{"D$$€¨?$d?°°d}€°!€§}!{€r}[¨€[§£X{£}§€"€?"°€§{"¨£D§?.*#####(,$}XD!dD§€D 15 | °§d?€$rr"¨""Xd[°"§"$£$""r(&%/#########(....£?!?"?¨r°$€d!$°!r€°(@@###//#######*,....,#(#@@%@&,$}./@/£$£XD°¨[#&.[£X£[X[}Xd?§XX[r}§}X"?"dr§"DDX£}r§€"§!d{[§?[!!€}}r 16 | "§[€?r§D€?§"*@&###%##(((((/***/((((((((((&&(@%*@&,X§[$?rD§§.&(dr¨!¨!{X[£§"€$$X?!$¨£!!°d§£X€{°[¨r[r"!!§?"°X"$r°rr¨°$?!Xd§$}!{$¨¨{§D¨{€d!D"*@%&@@&%####((((((((,$? 17 | r°![§"{}°"[[${dD?[?{£D[§?,&&##%&@@&#((((((((((,}r{{{?{rXd€.#@&####((//(##((((((((/**,/(/(#@&*&@&/}dr?d{}d}r"(&.?d{d£D?}¨r{£${d"$§X$rrd£°[¨d$r{?[d°D¨¨€£?XD}€!dD" 18 | ,.€€§?"°.(@@###(((((((((((/,.r€[{..,,**////#@%@@#"°dX§£"€$D{.&(€D€r§[}D€$£¨§"$Xrd¨!dd£¨¨!D$D°$§°}°§$$$¨£¨}¨!°r}rDr!?{D€[r$£¨Xr$§{§?£?d?§§¨*@%#((((#&@@%#((((///* 19 | $d!rD}${dr$X}!}$¨[}}d¨}d?{.#@%((((((((#&@@&(/******,./&@@@%#(((#((((((*..[{X.,*****/#(***,.,,*/.£°£D¨{€{¨¨!}"(&.?°[D€D"€}{!¨Dr$§}d€D{!{$"D¨}§§¨X°r£!d"?d?°d"£$€D 20 | /****/,¨X,(((((((((/.D"?.,********/##(###(,&%&%.,d£D¨?{X§§"X[.@/X€}}$?$DD[!{£¨r?DX[D§}X$DD°$$¨{€[d"°{£[¨rr{§§°}XX¨£"{§X£§$¨?}°{§£r£{$££X°r$.#@&(((((((//**/#&@&( 21 | D[¨Xd!dd[£€§{"D"$§°X¨{D£{£[}.#@%#((((/*******/#&@@&%%#(*/((//(////,¨?€,**********,(######/.&%&@@&,€{£€§r£§?$€£%%,"!¨$XD{£¨D£[D"¨?§¨D!?[}{[[£D{€"°r$[dXd[§"r£X§"[ 22 | %%%@@@@@%(.//*(*,}§r,********,,**(#####(((.&%&@#,!$D},*¨°¨°}!€*@@@@@@@@@@@@%#(.°[¨}¨?d!X{°!XX}$}["§["{!¨[¨€!¨D!"}$§D"Xd[}[}??{??£!€¨[?€{!X?r§,#@@#(/**********/% 23 | ["DX§}§X¨{X$°dX{€r£{D[!€"€D£}*((#@%*******/(#%%%%%%%%%&@@@@%(/.¨€£,****,,**.,**/(#(#######(@/%@#.£DdD/@.D¨¨£D$"&@(%**,,**,..*#@@@&/.[§DD?£$Dr"D"£°r?"rXr[€€¨€[?! 
24 | %%%%%%#((#&@@@%/****,.**,.**../#(/###,§*(#%&.%@%,¨°}r.&/.$¨{$!€/&#?,*,,****.D,*,*@&(#,¨D"DrX€X{"¨}¨{$¨§€°[[¨d¨?¨}${[£X{D¨¨°¨DX££r€§d"$°D[X$"r./(#@@@@(**/#%%%%%% 25 | d£d}[[{}?€d£{r!!"DXrdd!§°d!$X!.*/&@@@@(#%%%%%%%%%%%%#((((((#&@@@@##(/*****,£X*#((###,",##*@%°#@@(rX!¨d%%,£!£}€?"/@,,*****,*,,**,..#@@@/Drd"€r}}°€Xr"§d?X"$?"??€r 26 | #(((((((((((((#%&@@@&#(((,[£.(####(d§/##&@@@,.&@@.?rd{/@#!£§"DDX[%%.***,*,***.*,.D,/&@@&(§D£}€dXd}¨°£[dD!Dd€£€Xd""r§[€§$°$r€??§X¨??€rr"{?d{€D°d"¨(@*&@&%%%%%%%%% 27 | X!°XD$¨?!}d}!$?¨€$€°°"X}°{$X€rD€!*@%%@@&%%%%#(((((((((((((((((((((#%@@@@@#€,######&%###@@@@@%X,&@(r}D{"%@,£°$[!£€.@(,**.,,,**.**,€,*.*@@@(}D¨{?£§}D{}°d£{}"?}dX{ 28 | ((((((((((((((((((((((#%&@@@###(,*#*/#@@@.?.&({.(&.d{?",%%["}$"Xd{*@***,.*,**.**,!,*.[.&@(,?}[§?[££§€{£rD{{£°[!$£?{}£{}"X?[[}§"{{°°DX€§X[X§££!D"!,@&%@@@@%#((((( 29 | £[d!D°[§°{€¨°€{}[$£°£r§D°"$X€$"![,@&@@%&&@#(((((((((((((((((((((((((((((/£(%##/,/(.,##@%*D¨D/@.[.@(€Dr""*@,§X§X!X{X%%,*,.,***.,**.,*.}r,#@#.°}[X?}¨d$¨§r[D{£d°}D 30 | ((((((((((((((((((((/,X?¨¨[(#/(/,€(@@@%d!dD}{%%"€*@,"{}r¨#&[{"£{"[?.@#**,.*,*,,**,.*."r€.%@*d?°d"!X¨d€D§{"{r°X?§€§£Xr}{ddX{£$r}¨d°{X$°€§€}"}[}rD.(@&##%#(%@#(((( 31 | -------------------------------------------------------------------------------- /tutorial/dvc_overview.md: -------------------------------------------------------------------------------- 1 | Data Version Control 2 | ==================== 3 | 4 | 5 | Overview 6 | --------- 7 | - Each run is tracked and is reproducible 8 | - Each run can be part of a pipeline 9 | - A complete pipeline is reproducible according to a chosen version 10 | (i.e. a chosen commit) 11 | - The cache mechanism allows reproducing sub-pipelines (only the parts with outdated dependencies) 12 | - Several kinds of storage can be configured to handle data files (AWS S3, Azure, 13 | Google Cloud Storage, SSH, HDFS) 14 | 15 | 16 | - Need to be rigorous: the inputs and outputs of each run must be explicitly 17 | specified to be handled as dependencies 18 | - Commands can't be run through a Jupyter Notebook 19 | 20 | 21 | 22 | How it works 23 | ------------ 24 | 25 | **DVC** depends on **Git**. You need a Git repository, and you manage your 26 | *code* versioning yourself. 27 | You should think of **DVC** as a git extension. 28 | 29 | 1. As usual, create a git repository and version your files 30 | 2. Activate DVC (`dvc init`) 31 | 3. Add data files and manage their versioning with DVC (`dvc add [my_file]`). 32 | At this step DVC puts data files in its cache and creates meta files to 33 | identify them. 34 | (see section **Add data file**) 35 |
4. Commit the meta files using Git to save a version of the pipeline 36 | 37 | 38 | 39 | Small tutorial 40 | --------------- 41 | 42 | ### Install DVC 43 | 44 |     pip install dvc 45 | 46 | ### Setup a git environment 47 | 48 |     mkdir test_dvc 49 |     cd test_dvc 50 | 51 |     git init 52 |     # Create a Python script which reads an input file and writes its content in upper case 53 |     mkdir code 54 |     echo '#!/usr/bin/env python' > code/python_script.py 55 |     echo -e "with open('./data/input_file.txt', 'r') as fd, open('./results/output_file.txt', 'w') \ 56 |     as wfd:\n    wfd.write(fd.read().upper())" >> code/python_script.py 57 |     chmod +x ./code/python_script.py 58 | 59 |     # Commit your script 60 |     git add ./code/python_script.py 61 |     git commit -m 'Initialize env' 62 |
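The two `echo` commands above generate `code/python_script.py`. Unescaped, the resulting script is equivalent to this (shown here with standard indentation for readability):

    #!/usr/bin/env python
    # Read the input file and write its content in upper case.
    with open('./data/input_file.txt', 'r') as fd, open('./results/output_file.txt', 'w') as wfd:
        wfd.write(fd.read().upper())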
63 | ### Setup DVC environment 64 | 65 |     # In ./test_dvc (top level directory) 66 |     dvc init 67 |     git commit -m 'Initialize dvc' 68 | 69 | ### Add a data file 70 | 71 |     # Create a data file for the example 72 |     mkdir data 73 |     echo "This is a text" > data/input_file.txt 74 | 75 |     dvc add data/input_file.txt 76 | 77 | Here you can check that the meta file is created (`git status data`), that the real file 78 | is ignored by git (`cat ./data/.gitignore`) and that a cache entry is created (`ls -la .dvc/cache/`) 79 | 80 |     # Commit meta files in git 81 |     git add . 82 |     git commit -m "Add input data file" 83 | 84 | ### Run a step 85 | 86 |     dvc run -d [input file] -o [output file] [cmd] 87 | 88 |     mkdir results 89 |     dvc run -d ./data/input_file.txt -o ./results/output_file.txt ./code/python_script.py 90 | 91 | Check that the output file *./results/output_file.txt* and the meta file *./output_file.txt.dvc* are generated 92 | 93 | 94 | ### Run a pipeline 95 | A pipeline is composed of several steps, so we need to create at least one more step here. 96 | 97 |     # Run another step and create a pipeline 98 |     MY_CMD="cat ./results/output_file.txt | wc -c > ./results/nb_letters.txt" 99 |     dvc run -d ./results/output_file.txt -o ./results/nb_letters.txt -f MyPipeline.dvc $MY_CMD 100 | 101 | See the result 102 | 103 |     cat ./results/nb_letters.txt 104 | 105 | At this step the file *./MyPipeline.dvc* represents the pipeline for the current version of code and data 106 | 107 |     # Reproduce the pipeline 108 |     dvc repro MyPipeline.dvc 109 | 110 | Nothing happens because nothing has changed; try `dvc repro MyPipeline.dvc -v` 111 | 112 |     # Force the pipeline run 113 |     dvc repro MyPipeline.dvc -v -f 114 | 115 |     git add . 116 |     git commit -m 'pipeline creation' 117 | 118 | ### Modify the input and re-run 119 | 120 |     echo "new input" >> data/input_file.txt 121 | 122 |     dvc repro MyPipeline.dvc -v 123 | 124 |     cat ./results/nb_letters.txt 125 | 126 |     git commit -am 'New pipeline version' 127 | 128 | 129 | ### See pipeline steps 130 | 131 |     dvc pipeline show MyPipeline.dvc 132 | 133 | Need to be rigorous 134 | ------------------- 135 | 136 | - the inputs and outputs of each run must be explicitly 137 | specified to be handled as dependencies 138 | - when you modify a data file you need to re-run the associated step to be able 139 | to version it (or reproduce the whole pipeline using the cache mechanism) 140 | 141 | Various 142 | ------- 143 | 144 | See the [Data Version Control documentation](https://github.com/iterative/dvc) 145 | 146 | See the [Data Version Control tutorial](https://blog.dataversioncontrol.com/data-version-control-tutorial-9146715eda46) 147 | -------------------------------------------------------------------------------- /resources/dvc_playground/user/resources/steps/decrypt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import logging 4 | from argparse import ArgumentParser 5 | from random import randint 6 | from typing import List 7 | 8 | noise_chars = ['!', '?', '§', 'X', 'r', 'd', 'D', '¨', '$', '£', '"', "€", '°', '}', '{', '['] 9 | 10 | 11 | def add_noise(content_lines: List[str]): 12 |     replaced_content = [] 13 |     max_line_length = len(max(content_lines, key=len)) 14 |     for line in content_lines: 15 |         line = line.replace('\n', '') 16 |         if len(line) < max_line_length: 17 |             line = line + (max_line_length - len(line)) * ' ' 18 |         replaced_content.append(''.join([noise_chars[randint(0, len(noise_chars) - 1)] if char == ' ' else char 19 |                                          for char in line])) 20 |     return [f'{line}\n' for line in replaced_content] 21 | 22 | 23 | def remove_noise(content_lines: List[str]): 24 |     return [''.join(' ' if char in noise_chars else char for char in line) for line in content_lines] 25 | 26 | 27 | def shift_rows(content_lines: List[str], shift: int): 28 |     nb_lines = len(content_lines) 29 |     return [content_lines[(i + shift) % nb_lines] for i in range(0, nb_lines)] 30 | 31 | 32 | def unshift_rows(content_lines: List[str], shift: int): 33 |     nb_lines = len(content_lines) 34 |     return [content_lines[(i - shift) % nb_lines] for i in range(0, nb_lines)] 35 | 36 | 37 | def shift_cols(content_lines: List[str], even_shift: int, odd_shift: int): 38 |     shifted_content = [] 39 |     for idx, line in enumerate(content_lines): 40 |         line = line.replace('\n', '') 41 |         new_line = '' 42 |         if idx % 2 == 0: 43 |             for char_id in range(0, len(line)): 44 |                 new_line += line[(char_id + even_shift) % len(line)] 45 |         else: 46 |             for char_id in range(0, len(line)): 47 |                 new_line += line[(char_id + odd_shift) % len(line)] 48 |         shifted_content.append(new_line) 49 | 50 |     return [f'{line}\n' for line in shifted_content] 51 | 52 | 53 | def unshift_cols(content_lines: List[str], even_shift: int, odd_shift: int): 54 |     unshifted_content = [] 55 |     for idx, line in enumerate(content_lines): 56 |         line = line.replace('\n', '') 57 |         new_line = '' 58 |         if idx % 2 == 0: 59 |             for char_id in range(0, len(line)): 60 |                 new_line += line[(char_id - even_shift) % len(line)] 61 |         else: 62 |             for char_id in range(0, len(line)): 63 |                 new_line += line[(char_id - odd_shift) % len(line)] 64 |         unshifted_content.append(new_line) 65 | 66 |     return [f'{line}\n' for line in unshifted_content] 67 | 68 | 69 |
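# Each unshift_* helper above is the exact inverse of the corresponding
# shift_* helper. A quick round-trip check with illustrative values (these
# values are not part of the exercise):
#
#     lines = ['ab\n', 'cd\n', 'ef\n']
#     assert unshift_rows(shift_rows(lines, shift=1), shift=1) == lines
#     assert unshift_cols(shift_cols(lines, even_shift=1, odd_shift=2),
#                         even_shift=1, odd_shift=2) == lines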
def encrypt(input_file: str, output_file: str, row_shift: int, col_even_shift: int, col_odd_shift: int): 70 |     with open(input_file, 'r') as fd: 71 |         content_lines = fd.readlines() 72 | 73 |     noisy_content = add_noise(content_lines) 74 |     row_shifted = shift_rows(noisy_content, shift=row_shift) 75 |     col_shifted = shift_cols(row_shifted, even_shift=col_even_shift, odd_shift=col_odd_shift) 76 | 77 |     with open(output_file, 'w') as fd: 78 |         fd.writelines(col_shifted) 79 | 80 | 81 | def decrypt(encrypted_file: str, output_file: str, row_shift: int, col_even_shift: int, col_odd_shift: int): 82 |     with open(encrypted_file, 'r') as fd: 83 |         content_lines = fd.readlines() 84 | 85 |     col_unshifted = unshift_cols(content_lines, even_shift=col_even_shift, odd_shift=col_odd_shift) 86 |     row_unshifted = unshift_rows(col_unshifted, shift=row_shift) 87 |     decrypted_content = remove_noise(row_unshifted) 88 | 89 |     with open(output_file, 'w') as fd: 90 |         fd.writelines(decrypted_content) 91 | 92 | 93 | if __name__ == '__main__': 94 |     parser = ArgumentParser(description='Decrypt file!') 95 |     parser.add_argument('-i', '--input-file', required=True) 96 |     parser.add_argument('-o', '--output-file', required=True) 97 |     parser.add_argument('-p', '--param-file', required=True) 98 |     parser.add_argument('-e', '--encrypt', action='store_true') 99 | 100 |     args = parser.parse_args() 101 |     try: 102 |         with open(args.param_file, 'r') as fd: 103 |             params = json.load(fd) 104 | 105 |         row_shift = params['row_shift'] 106 |         col_even_shift = params['col_even_shift'] 107 |         col_odd_shift = params['col_odd_shift'] 108 | 109 |         if args.encrypt: 110 |             encrypt(args.input_file, args.output_file, row_shift, col_even_shift, col_odd_shift) 111 |         else: 112 |             decrypt(args.input_file, args.output_file, row_shift, col_even_shift, col_odd_shift) 113 |     except KeyError as e: 114 |         logging.error(f'Parameter error: {e}') 115 |     except json.JSONDecodeError as e: 116 |         logging.error(f'Parameter file wrongly formatted: {e}') 117 |     except IOError as e: 118 |         logging.error(f'IOError: {e}') 119 | 120 | -------------------------------------------------------------------------------- /talks/reveal.js/plugin/markdown/example.html: -------------------------------------------------------------------------------- [HTML markup stripped during extraction; only the page title "reveal.js - Markdown Demo" is recoverable]
-------------------------------------------------------------------------------- /resources/05_Tune_hyperparameters_with_crossvalidation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Tune hyperparameters using cross-validation\n", 8 | "\n", 9 | "In this notebook, we will tune the hyperparameters of a simple text classification pipeline. \n", 10 | "\n", 11 | "Starting from the raw text data, we will encode it using bag of words (*hyperparameter 1*: number of words in the vocabulary), and then train a Logistic Regression classifier (*hyperparameter 2*: regularization parameter). We will evaluate performance using (repeated) cross-validation.\n", 12 | "\n", 13 | "Metrics from each run will be stored with the **MLflow tracking API**. That's the output we want to version with **DVC**." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# Parameters\n", 23 | "\"\"\"\n", 24 | ":param str input_csv_file: Path to input file\n", 25 | ":param List[float] C_list: List of inverse regularisation coefficient values\n", 26 | ":param List[int] max_features_list: List of maximum numbers of features\n", 27 | ":param str mlflow_output: MLflow metrics directory\n", 28 | ":dvc-in input_csv_file: ./poc/data/data_train.csv\n", 29 | ":dvc-out mlflow_output: ./poc/data/cross_valid_metrics\n", 30 | ":dvc-extra: --C-list .1 1.0 --max-features-list 100 500 1000\n", 31 | "\"\"\"\n", 32 | "# Values of the parameters for this Jupyter Notebook only\n", 33 | "# the notebook is in ./poc/pipeline/notebooks\n", 34 | "input_csv_file = \"../../data/data_train.csv\"\n", 35 | "C_list = [.1, 1.0]\n", 36 | "max_features_list = [100, 500, 1000]\n", 37 | "mlflow_output = '../../data/cross_valid_metrics'" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "from sklearn.feature_extraction.text import CountVectorizer\n", 48 | "from sklearn.linear_model import LogisticRegression\n", 49 | "from sklearn.pipeline import Pipeline\n", 50 | "from sklearn.model_selection import cross_validate, RepeatedStratifiedKFold\n", 51 | "import mlflow\n", 52 | "from itertools import product" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "df = pd.read_csv(input_csv_file).dropna()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "def log_results(d):\n", 71 | "    for metrics, values in d.items():\n", 72 | "        mlflow.log_metric(metrics + '_avg', values.mean())\n", 73 | "        mlflow.log_metric(metrics + '_std', values.std())" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "mlflow.set_tracking_uri(mlflow_output)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "for C, max_features in product(C_list, max_features_list):\n", 92 | "    with mlflow.start_run():\n", 93 | "        mlflow.log_param('C', C)\n", 94 | "        mlflow.log_param('max_features', max_features)\n", 95 | "        classifier = LogisticRegression(C=C,\n", 96 | "
solver='lbfgs',\n", 97 | " multi_class='multinomial')\n", 98 | " vectorizer = CountVectorizer(max_features=max_features,\n", 99 | " stop_words='english')\n", 100 | " pipeline = Pipeline([('vectorizer', vectorizer),\n", 101 | " (classifier.__repr__().split('(')[0], classifier)])\n", 102 | " d = cross_validate(pipeline,\n", 103 | " X=df['data'],\n", 104 | " y=df['target'],\n", 105 | " scoring=['accuracy', 'precision_macro', 'f1_micro', 'f1_macro'],\n", 106 | " cv=RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=0))\n", 107 | " log_results(d)\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python [conda env:poc_ml_versioning]", 121 | "language": "python", 122 | "name": "conda-env-poc_ml_versioning-py" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.7.0" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 2 139 | } 140 | --------------------------------------------------------------------------------