├── .gitignore ├── LICENSE.md ├── MANUAL.md ├── README.md ├── docker ├── Dockerfile └── project │ └── test │ ├── Experiment_setup_test.xls │ ├── Experiment_setup_test.xlsx │ ├── results │ ├── Designtable_optimal_Nrun72.csv │ ├── experiment_results_Nrun72.xls │ └── experiment_results_Nrun72.xlsx │ ├── settings_design_test.yaml │ └── settings_expresults_test.yaml ├── docs ├── DoEgen_explained │ ├── 01_experiment_setup_definition_.md │ ├── 02_design_generation_.md │ ├── 03_design_evaluation___efficiency_metrics_.md │ ├── 04_design_selection_.md │ ├── 05_experiment_result_input___merging_.md │ ├── 06_result_analysis___statistics_.md │ ├── 07_result_visualization_.md │ ├── 08_configuration_handling_.md │ └── index.md ├── MANUAL.md └── MANUAL.pdf ├── doegen ├── Experiment_results.xlsx ├── Experiment_setup.xlsx ├── Experiment_setup_extended.xlsx ├── __init__.py ├── configloader.py ├── configloader_results.py ├── create_resultfile.py ├── create_setupfile.py ├── create_setupfile_extended.py ├── doegen.py ├── doeval.py ├── init_config.py ├── init_tests.py ├── settings_design.yaml ├── settings_expresults.yaml └── test │ ├── Experiment_setup_test.xlsx │ ├── results │ ├── Designtable_best_Nrun18.csv │ ├── Designtable_minimum_Nrun30.csv │ ├── Designtable_optimal_Nrun72.csv │ └── experiment_results_Nrun72.xlsx │ ├── settings_design_test.yaml │ └── settings_expresults_test.yaml ├── figures ├── BestFactor_Avg1.png ├── Designtable_optimal_Nrun72.png ├── Efficiencies.png ├── Efficiencies_[3, 3, 3, 3, 3, 3, 2, 2].png ├── Experiment_result_Nrun72_header.png ├── Expresult_correlation_X_1.png ├── Expresult_pairwise-correlation_1.png ├── Result_header.png ├── Results_overview.png ├── Setup_header.png ├── Setup_header_test.png ├── Top10.png ├── Ybarplot_1.png └── pairwise_correlation.png ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | .eggs/* 2 | build/ 3 | *.egg 4 | DoEgen.egg-info/dependency_links.txt 5 | 
DoEgen.egg-info/PKG-INFO 6 | DoEgen.egg-info/requires.txt 7 | DoEgen.egg-info/SOURCES.txt 8 | DoEgen.egg-info/top_level.txt 9 | build/lib/doegen/doegen.py 10 | build/lib/doegen/doegen.py 11 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | ### GNU LESSER GENERAL PUBLIC LICENSE 2 | 3 | Version 3, 29 June 2007 4 | 5 | Copyright (C) 2007 Free Software Foundation, Inc. 6 | 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | This version of the GNU Lesser General Public License incorporates the 12 | terms and conditions of version 3 of the GNU General Public License, 13 | supplemented by the additional permissions listed below. 14 | 15 | #### 0. Additional Definitions. 16 | 17 | As used herein, "this License" refers to version 3 of the GNU Lesser 18 | General Public License, and the "GNU GPL" refers to version 3 of the 19 | GNU General Public License. 20 | 21 | "The Library" refers to a covered work governed by this License, other 22 | than an Application or a Combined Work as defined below. 23 | 24 | An "Application" is any work that makes use of an interface provided 25 | by the Library, but which is not otherwise based on the Library. 26 | Defining a subclass of a class defined by the Library is deemed a mode 27 | of using an interface provided by the Library. 28 | 29 | A "Combined Work" is a work produced by combining or linking an 30 | Application with the Library. The particular version of the Library 31 | with which the Combined Work was made is also called the "Linked 32 | Version". 
33 | 34 | The "Minimal Corresponding Source" for a Combined Work means the 35 | Corresponding Source for the Combined Work, excluding any source code 36 | for portions of the Combined Work that, considered in isolation, are 37 | based on the Application, and not on the Linked Version. 38 | 39 | The "Corresponding Application Code" for a Combined Work means the 40 | object code and/or source code for the Application, including any data 41 | and utility programs needed for reproducing the Combined Work from the 42 | Application, but excluding the System Libraries of the Combined Work. 43 | 44 | #### 1. Exception to Section 3 of the GNU GPL. 45 | 46 | You may convey a covered work under sections 3 and 4 of this License 47 | without being bound by section 3 of the GNU GPL. 48 | 49 | #### 2. Conveying Modified Versions. 50 | 51 | If you modify a copy of the Library, and, in your modifications, a 52 | facility refers to a function or data to be supplied by an Application 53 | that uses the facility (other than as an argument passed when the 54 | facility is invoked), then you may convey a copy of the modified 55 | version: 56 | 57 | - a) under this License, provided that you make a good faith effort 58 | to ensure that, in the event an Application does not supply the 59 | function or data, the facility still operates, and performs 60 | whatever part of its purpose remains meaningful, or 61 | - b) under the GNU GPL, with none of the additional permissions of 62 | this License applicable to that copy. 63 | 64 | #### 3. Object Code Incorporating Material from Library Header Files. 65 | 66 | The object code form of an Application may incorporate material from a 67 | header file that is part of the Library. 
You may convey such object 68 | code under terms of your choice, provided that, if the incorporated 69 | material is not limited to numerical parameters, data structure 70 | layouts and accessors, or small macros, inline functions and templates 71 | (ten or fewer lines in length), you do both of the following: 72 | 73 | - a) Give prominent notice with each copy of the object code that 74 | the Library is used in it and that the Library and its use are 75 | covered by this License. 76 | - b) Accompany the object code with a copy of the GNU GPL and this 77 | license document. 78 | 79 | #### 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, taken 82 | together, effectively do not restrict modification of the portions of 83 | the Library contained in the Combined Work and reverse engineering for 84 | debugging such modifications, if you also do each of the following: 85 | 86 | - a) Give prominent notice with each copy of the Combined Work that 87 | the Library is used in it and that the Library and its use are 88 | covered by this License. 89 | - b) Accompany the Combined Work with a copy of the GNU GPL and this 90 | license document. 91 | - c) For a Combined Work that displays copyright notices during 92 | execution, include the copyright notice for the Library among 93 | these notices, as well as a reference directing the user to the 94 | copies of the GNU GPL and this license document. 95 | - d) Do one of the following: 96 | - 0) Convey the Minimal Corresponding Source under the terms of 97 | this License, and the Corresponding Application Code in a form 98 | suitable for, and under terms that permit, the user to 99 | recombine or relink the Application with a modified version of 100 | the Linked Version to produce a modified Combined Work, in the 101 | manner specified by section 6 of the GNU GPL for conveying 102 | Corresponding Source. 103 | - 1) Use a suitable shared library mechanism for linking with 104 | the Library. 
A suitable mechanism is one that (a) uses at run 105 | time a copy of the Library already present on the user's 106 | computer system, and (b) will operate properly with a modified 107 | version of the Library that is interface-compatible with the 108 | Linked Version. 109 | - e) Provide Installation Information, but only if you would 110 | otherwise be required to provide such information under section 6 111 | of the GNU GPL, and only to the extent that such information is 112 | necessary to install and execute a modified version of the 113 | Combined Work produced by recombining or relinking the Application 114 | with a modified version of the Linked Version. (If you use option 115 | 4d0, the Installation Information must accompany the Minimal 116 | Corresponding Source and Corresponding Application Code. If you 117 | use option 4d1, you must provide the Installation Information in 118 | the manner specified by section 6 of the GNU GPL for conveying 119 | Corresponding Source.) 120 | 121 | #### 5. Combined Libraries. 122 | 123 | You may place library facilities that are a work based on the Library 124 | side by side in a single library together with other library 125 | facilities that are not Applications and are not covered by this 126 | License, and convey such a combined library under terms of your 127 | choice, if you do both of the following: 128 | 129 | - a) Accompany the combined library with a copy of the same work 130 | based on the Library, uncombined with any other library 131 | facilities, conveyed under the terms of this License. 132 | - b) Give prominent notice with the combined library that part of it 133 | is a work based on the Library, and explaining where to find the 134 | accompanying uncombined form of the same work. 135 | 136 | #### 6. Revised Versions of the GNU Lesser General Public License. 137 | 138 | The Free Software Foundation may publish revised and/or new versions 139 | of the GNU Lesser General Public License from time to time. 
Such new 140 | versions will be similar in spirit to the present version, but may 141 | differ in detail to address new problems or concerns. 142 | 143 | Each version is given a distinguishing version number. If the Library 144 | as you received it specifies that a certain numbered version of the 145 | GNU Lesser General Public License "or any later version" applies to 146 | it, you have the option of following the terms and conditions either 147 | of that published version or of any later version published by the 148 | Free Software Foundation. If the Library as you received it does not 149 | specify a version number of the GNU Lesser General Public License, you 150 | may choose any version of the GNU Lesser General Public License ever 151 | published by the Free Software Foundation. 152 | 153 | If the Library as you received it specifies that a proxy can decide 154 | whether future versions of the GNU Lesser General Public License shall 155 | apply, that proxy's public statement of acceptance of any version is 156 | permanent authorization for you to choose that version for the 157 | Library. -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:1 2 | FROM continuumio/miniconda3 3 | 4 | # Install conda environment 5 | RUN conda create -n doegen_app python=3.7 6 | 7 | # Activate conda environment and install swig and numpy 8 | ENV PATH /opt/conda/envs/doegen_app/bin:$PATH 9 | RUN /bin/bash -c ". 
activate doegen_app" && \ 10 | conda install --yes swig 11 | 12 | # Install DoEgen (incl OApackage) from PyPi 13 | # Numpy and gcc must be explicitly installed before installing OApackage 14 | RUN apt-get update && apt-get install -y g++ 15 | RUN pip install numpy 16 | # DoEgen install might take a few minutes since OApackage wheel building takes very long 17 | RUN pip install DoEgen 18 | 19 | WORKDIR project/ 20 | ENTRYPOINT ["python", "-m"] 21 | 22 | # Give default arguments, in case none are supplied on 23 | # the command-line, e.g. 24 | CMD ["doegen.init_tests"] 25 | 26 | 27 | #HOW TO BUILD (IN SHELL), e.g.: 28 | #docker build -t doegen-app:v1 . 29 | 30 | #HOW TO RUN (IN SHELL), e.g.: 31 | #docker run -it -v /project:/project doegen-app:v1 doegen.doegen .yaml 32 | #docker run -it -v /project:/project doegen-app:v1 doegen.doeval .yaml 33 | 34 | -------------------------------------------------------------------------------- /docker/project/test/Experiment_setup_test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/Experiment_setup_test.xls -------------------------------------------------------------------------------- /docker/project/test/Experiment_setup_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/Experiment_setup_test.xlsx -------------------------------------------------------------------------------- /docker/project/test/results/Designtable_optimal_Nrun72.csv: -------------------------------------------------------------------------------- 1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8 2 | 1,0,-10,1,1,L1,0,0,L1 3 | 2,3,-3,3,1,L2,0,1,L2 4 | 3,0,-10,3,3,L3,0,1,L2 5 | 4,6,-10,3,5,L3,1,1,L2 6 | 5,0,4,3,1,L3,0,0,L1 7 | 
6,3,-3,5,1,L3,2,0,L2 8 | 7,0,-3,3,3,L2,2,1,L2 9 | 8,6,4,1,1,L1,2,1,L2 10 | 9,3,4,5,3,L2,1,1,L1 11 | 10,6,4,5,5,L1,0,0,L2 12 | 11,3,4,1,1,L2,0,0,L2 13 | 12,6,-3,1,5,L2,2,0,L1 14 | 13,6,-3,3,1,L1,0,0,L1 15 | 14,6,-10,1,1,L2,2,1,L1 16 | 15,3,-10,1,5,L1,1,1,L1 17 | 16,6,-3,1,3,L1,0,1,L1 18 | 17,3,-3,1,3,L3,2,0,L2 19 | 18,0,-3,1,5,L1,1,0,L2 20 | 19,0,-3,3,5,L2,0,1,L1 21 | 20,0,-3,1,5,L3,1,0,L2 22 | 21,6,-3,5,1,L2,1,0,L2 23 | 22,6,-3,3,3,L3,2,0,L1 24 | 23,6,-10,1,3,L2,0,1,L2 25 | 24,6,4,3,1,L2,1,0,L2 26 | 25,6,4,5,3,L1,2,1,L2 27 | 26,6,-10,3,3,L2,2,1,L2 28 | 27,0,4,3,1,L1,1,1,L1 29 | 28,6,-10,5,5,L3,0,0,L2 30 | 29,6,4,1,5,L3,2,0,L1 31 | 30,6,-3,5,5,L2,1,1,L1 32 | 31,0,4,5,3,L2,0,0,L1 33 | 32,0,-3,3,5,L1,2,1,L2 34 | 33,0,4,1,5,L3,1,1,L2 35 | 34,0,4,5,5,L2,2,0,L2 36 | 35,6,-10,5,5,L1,2,1,L1 37 | 36,3,-10,5,1,L3,1,1,L2 38 | 37,0,-10,5,3,L1,1,0,L1 39 | 38,3,-10,5,1,L3,2,0,L1 40 | 39,3,4,5,3,L1,2,1,L1 41 | 40,3,-10,5,1,L1,1,0,L2 42 | 41,3,-10,3,5,L3,1,0,L1 43 | 42,0,-10,3,1,L2,2,0,L1 44 | 43,0,-3,1,1,L1,1,0,L2 45 | 44,6,4,5,3,L3,0,0,L2 46 | 45,3,-10,3,3,L1,2,0,L2 47 | 46,3,-3,3,3,L2,1,1,L1 48 | 47,3,4,1,5,L2,2,0,L1 49 | 48,6,4,3,3,L1,1,0,L1 50 | 49,3,-3,5,5,L3,0,1,L1 51 | 50,3,-3,1,1,L1,0,1,L1 52 | 51,0,4,5,1,L3,0,1,L1 53 | 52,0,4,1,3,L3,1,1,L2 54 | 53,3,4,3,5,L2,0,0,L2 55 | 54,0,-3,5,3,L1,0,0,L1 56 | 55,6,-3,3,3,L3,1,0,L2 57 | 56,3,-3,1,3,L3,0,1,L1 58 | 57,0,4,3,1,L3,2,1,L1 59 | 58,3,4,1,5,L2,0,1,L2 60 | 59,3,4,3,5,L1,2,0,L1 61 | 60,0,-10,5,1,L2,0,1,L2 62 | 61,3,-10,3,5,L1,0,0,L2 63 | 62,6,-10,3,5,L3,0,1,L1 64 | 63,3,4,3,3,L1,1,1,L2 65 | 64,3,-10,1,3,L2,1,0,L1 66 | 65,3,-3,5,1,L3,2,1,L2 67 | 66,6,-3,5,1,L2,1,0,L1 68 | 67,0,-3,5,5,L1,2,1,L2 69 | 68,6,-10,1,1,L1,0,1,L2 70 | 69,6,4,1,1,L3,1,1,L1 71 | 70,0,-10,1,3,L2,2,0,L2 72 | 71,0,-10,5,5,L2,1,1,L1 73 | 72,0,-10,1,3,L3,2,0,L1 74 | -------------------------------------------------------------------------------- /docker/project/test/results/experiment_results_Nrun72.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/results/experiment_results_Nrun72.xls -------------------------------------------------------------------------------- /docker/project/test/results/experiment_results_Nrun72.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/results/experiment_results_Nrun72.xlsx -------------------------------------------------------------------------------- /docker/project/test/settings_design_test.yaml: -------------------------------------------------------------------------------- 1 | # Settings for Experiment Design Generation 2 | 3 | # Path to exp design setup file 4 | path: 'test/' 5 | # Set path for output files. If empty (''), output folder will be same as above for setup file 6 | outpath: 'test/results_test/' 7 | # Filename for exp setup file 8 | fname_setup: 'Experiment_setup_test.xlsx' 9 | # Maximum number of experimental runs: 10 | nrun_max: 20 11 | 12 | # Set maximum time for optimization per run (in seconds, recommended to set to at least ~100s) 13 | maxtime_per_run: 80 14 | 15 | # Set maximal stepsize of run size interval, so that not every runsize has to be optimized 16 | # The larger the interval, the faster the total computation 17 | # by default (select delta_nrun = None) it will automatically select the interval step with the lowest common multiple of the levels 18 | # (e.g. 
a mix between level 2 and 3 will result in delta_nrun = 6) 19 | delta_nrun: None -------------------------------------------------------------------------------- /docker/project/test/settings_expresults_test.yaml: -------------------------------------------------------------------------------- 1 | # Settings for Experiment Design Generation 2 | 3 | # Path to exp design setup file 4 | inpath: 'test/results/' 5 | # Set path for output files. If '', output folder will be same as above for setup file 6 | outpath: 'test/doeval_results/' 7 | # Filename for exp design table in inpath 8 | fname_design: 'Designtable_optimal_Nrun72.csv' 9 | # Filename for exp results in inpath: 10 | fname_results: 'experiment_results_Nrun72.xlsx' -------------------------------------------------------------------------------- /docs/DoEgen_explained/01_experiment_setup_definition_.md: -------------------------------------------------------------------------------- 1 | # Chapter 1: Experiment Setup Definition 2 | 3 | Welcome to the `DoEgen` tutorial! Before we can start designing clever experiments, we first need to clearly tell `DoEgen` *what* we want to test. This chapter is all about defining the structure and ingredients of your experiment. 4 | 5 | ## What's the Big Idea? Planning Your Experiment Recipe 6 | 7 | Imagine you want to bake the perfect cake. You wouldn't just randomly throw ingredients into a bowl! You'd start with a recipe that lists: 8 | * **Ingredients:** Flour, Sugar, Eggs, Temperature, Baking Time... 9 | * **Amounts/Settings:** 2 cups of Flour, 1 cup of Sugar, Bake at 180°C for 30 minutes... 10 | * **Types:** White Flour vs. Whole Wheat, Granulated Sugar vs. Brown Sugar... 11 | 12 | Defining an **Experiment Setup** in `DoEgen` is exactly like creating this recipe list for your experiment. It tells `DoEgen` precisely: 13 | * What factors (variables or "ingredients") you want to change and study. 14 | * What different values or types (levels) each factor can have. 
15 | * Whether a factor is a number (like temperature) or a category (like material type). 16 | 17 | This definition is the absolute foundation for everything that follows. Without a clear setup, `DoEgen` can't generate an efficient plan for your experiments. 18 | 19 | ## Key Ingredients of an Experiment Setup 20 | 21 | Let's break down the components you need to define: 22 | 23 | 1. **Factors:** These are the variables you control and want to investigate in your experiment. 24 | * *Analogy:* In our cake example, `Temperature`, `Sugar Amount`, and `Flour Type` are factors. 25 | * *Example:* If studying plant growth, factors might be `Water Amount`, `Sunlight Hours`, `Fertilizer Type`. 26 | 27 | 2. **Factor Types:** Factors can be different kinds: 28 | * **Numeric:** Represented by numbers. 29 | * **Continuous:** Can take any value within a range (e.g., Temperature: 20.5°C, 31.2°C, etc.). 30 | * **Discrete:** Can only take specific numeric values, often whole numbers (e.g., Number of Seeds planted: 1, 2, 3). 31 | * **Categorical:** Represented by distinct categories or labels, not numbers (e.g., Fertilizer Type: 'Brand A', 'Brand B', 'Organic'). 32 | 33 | 3. **Levels:** These are the specific values or settings you choose to test for each factor. 34 | * *Analogy:* For the `Temperature` factor, you might test 3 levels: 170°C, 180°C, 190°C. For `Flour Type`, you might test 2 levels: 'White', 'Whole Wheat'. 35 | * The **number of levels** tells `DoEgen` how many different settings you want to examine for that factor. More levels allow for more detailed analysis but usually require more experiments. 36 | 37 | 4. **Ranges / Specific Values:** How you define the levels depends on the factor type: 38 | * **Numeric:** You typically provide a `Minimum` and `Maximum` value. `DoEgen` then calculates evenly spaced levels based on the `Level Number` you specified. For example, if Temperature is Continuous, has 3 Levels, Min=20, Max=40, the levels might be 20, 30, 40. 
39 | * **Categorical:** You usually list the exact names of the categories (levels). For example, if Fertilizer Type has 2 levels, you might specify them as 'Brand A', 'Brand B'. 40 | 41 | ## The Experiment Setup Excel Template 42 | 43 | The easiest way to give `DoEgen` this "recipe" is by filling out a simple Excel spreadsheet. `DoEgen` provides template files to get you started. 44 | 45 | You can create a blank template by running a helper script included with `DoEgen`: 46 | 47 | ```bash 48 | # Run this command in your terminal in the DoEgen project directory 49 | python -m doegen.create_setupfile 50 | # Or for an extended version with more options: 51 | # python -m doegen.create_setupfile_extended 52 | ``` 53 | 54 | This creates an Excel file (like `Experiment_setup_template.xlsx`) with the necessary columns. Here's what the main columns mean: 55 | 56 | | Column Header | Description | Example | 57 | | :--------------- | :---------------------------------------------------------------------------------------------------------------------------------------- | :---------------- | 58 | | `Parameter Name` | The name of your factor. | `Temperature` | 59 | | `Parameter Type` | The type of factor: `Continuous`, `Discrete`, or `Categorical`. | `Continuous` | 60 | | `Level Number` | How many different values/settings you want to test for this factor. | `3` | 61 | | `Minimum` | The lowest value for Numeric factors. (Leave blank for Categorical). | `20` | 62 | | `Maximum` | The highest value for Numeric factors. (Leave blank for Categorical). | `40` | 63 | | `Levels` (Optional) | For Categorical factors, list the exact level names separated by commas. Can also be used for specific numeric levels. | `Brand A, Brand B` | 64 | | `Include (Y/N)` (Optional) | Set to 'No' if you want to list a factor but *not* vary it in this specific design. Defaults to 'Yes'. 
| `Yes` | 65 | 66 | Here's how you might fill it out for a simple experiment: 67 | 68 | *(Based on the image from `MANUAL.md`)* 69 | ![Experiment Setup Table Header.](https://github.com/sebhaan/DoEgen/blob/main/figures/Setup_header.png){width=600} 70 | 71 | **Example Fill-out:** 72 | 73 | | Parameter Name | Parameter Type | Level Number | Minimum | Maximum | Levels | 74 | | :------------- | :------------- | :----------- | :------ | :------ | :--------------------- | 75 | | Temperature | Continuous | 3 | 20 | 40 | | 76 | | Pressure | Discrete | 2 | 1 | 5 | | 77 | | Catalyst | Categorical | 2 | | | Catalyst X, Catalyst Y | 78 | | Speed | Continuous | 3 | 100 | 300 | | 79 | 80 | This table clearly tells `DoEgen`: 81 | * We have 4 factors: Temperature, Pressure, Catalyst, Speed. 82 | * Temperature is continuous, tested at 3 levels between 20 and 40. 83 | * Pressure is discrete, tested at 2 levels between 1 and 5. 84 | * Catalyst is categorical, tested with 'Catalyst X' and 'Catalyst Y'. 85 | * Speed is continuous, tested at 3 levels between 100 and 300. 86 | 87 | ## How `DoEgen` Reads Your Recipe (Simplified View) 88 | 89 | Under the hood, `DoEgen` uses Python code to read this Excel file and understand your experimental setup. It primarily uses the `pandas` library to handle the spreadsheet data. 
90 | 91 | Here's a very simplified Python snippet illustrating the core idea (the actual code in `doegen/doegen.py` within the `read_setup_new` function is more detailed): 92 | 93 | ```python 94 | # Simplified view of how DoEgen reads the setup file (doegen/doegen.py) 95 | import pandas as pd 96 | 97 | def read_setup_simplified(fname_setup): 98 | """Reads the Excel setup file and extracts factor information.""" 99 | try: 100 | # Use pandas library to read the Excel file into a table (DataFrame) 101 | df = pd.read_excel(fname_setup) 102 | print(f"Successfully read setup file: {fname_setup}") 103 | 104 | # --- Extract Basic Info --- 105 | # Get lists of names, types, levels, etc. from the table columns 106 | factor_names = df["Parameter Name"].tolist() 107 | level_numbers = df["Level Number"].tolist() 108 | # ... extract other columns like Parameter Type, Min, Max, Levels ... 109 | 110 | # --- Determine Specific Level Values (Simplified Logic) --- 111 | # (Actual code calculates numeric levels based on min/max/count 112 | # and parses categorical levels from the 'Levels' column) 113 | # level_values = calculate_actual_levels(df) # Placeholder 114 | 115 | print(f"Found {len(factor_names)} factors to include:") 116 | print(f" Names: {factor_names}") 117 | print(f" Levels per factor: {level_numbers}") 118 | 119 | # Store this information in a structured way (like the ExperimentalSetup object) 120 | # setup_object = create_setup_object(level_numbers, level_values, factor_names) 121 | # return setup_object # Return the processed setup info 122 | 123 | except FileNotFoundError: 124 | print(f"Error: Setup file not found at {fname_setup}") 125 | except Exception as e: 126 | print(f"Error reading setup file: {e}") 127 | 128 | # Example of how DoEgen might use this function internally: 129 | # experiment_setup = read_setup_simplified("Experiment_setup_template.xlsx") 130 | # if experiment_setup: 131 | # # Now use experiment_setup for Design Generation... 
132 | # pass 133 | ``` 134 | 135 | This code essentially: 136 | 1. Opens and reads the Excel file specified. 137 | 2. Pulls out the information from each column (Parameter Name, Level Number, etc.). 138 | 3. Processes this raw information to figure out the exact level values for each factor (e.g., calculating `[20, 30, 40]` for Temperature). 139 | 4. Packages this structured information neatly so other parts of `DoEgen` can use it. 140 | 141 | ## The Process Flow 142 | 143 | Here's a simple diagram showing how your Excel file becomes the setup definition inside `DoEgen`: 144 | 145 | ```mermaid 146 | sequenceDiagram 147 | participant U as User 148 | participant DG as DoEgen (Main Script) 149 | participant RSN as read_setup_new() Function 150 | participant P as Pandas Library 151 | participant ESO as ExperimentSetup Object 152 | 153 | U->>DG: Specifies path to 'Experiment_setup.xlsx' 154 | DG->>RSN: Calls read_setup_new() with the path 155 | RSN->>P: Asks Pandas to read the Excel file 156 | P-->>RSN: Returns the data as a table (DataFrame) 157 | RSN->>RSN: Extracts columns (Names, Types, Levels, Min, Max...) 158 | RSN->>RSN: Calculates specific level values (e.g., [20, 30, 40]) 159 | RSN->>ESO: Creates an 'ExperimentSetup' object containing all processed info 160 | ESO-->>RSN: Returns the created object 161 | RSN-->>DG: Returns the completed 'ExperimentSetup' object 162 | Note right of DG: DoEgen now has the structured recipe! 163 | ``` 164 | 165 | ## Conclusion 166 | 167 | In this chapter, we learned the fundamental concept of the **Experiment Setup Definition**. It's the crucial first step where you precisely define the "ingredients" (factors), their "types" (numeric/categorical), and the specific "settings" (levels) you want to test in your experiment. We saw how to provide this information using a structured Excel template. 168 | 169 | This setup definition acts as the blueprint or recipe that `DoEgen` needs. 
With this information clearly defined, we are now ready to move on to the next exciting step: actually creating the experimental plan. 170 | 171 | Let's dive into [Chapter 2: Design Generation 172 | ](02_design_generation_.md) to see how `DoEgen` uses this setup to build an efficient experiment schedule! 173 | 174 | --- 175 | 176 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/02_design_generation_.md: -------------------------------------------------------------------------------- 1 | # Chapter 2: Design Generation 2 | 3 | In [Chapter 1: Experiment Setup Definition 4 | ](01_experiment_setup_definition_.md), we learned how to create the "recipe" for our experiment by defining the factors, levels, and types. Now that `DoEgen` knows *what* we want to test, this chapter focuses on *how* to plan the actual sequence of experiments efficiently. This core process is called **Design Generation**. 5 | 6 | ## The Challenge: Too Many Experiments! 7 | 8 | Imagine you want to test different settings for baking that perfect cake from Chapter 1. Let's say you have: 9 | * Temperature: 3 levels (170°C, 180°C, 190°C) 10 | * Sugar Amount: 3 levels (0.8 cup, 1 cup, 1.2 cups) 11 | * Flour Type: 2 levels (White, Whole Wheat) 12 | * Baking Time: 3 levels (25 min, 30 min, 35 min) 13 | 14 | If you wanted to test *every single possible combination* (a "full factorial" design), you'd need to bake: 15 | `3 (Temp) * 3 (Sugar) * 2 (Flour) * 3 (Time) = 54` cakes! 16 | 17 | That's a lot of baking! For more complex experiments with more factors or levels, the number of combinations explodes quickly. This is where **Design Generation** comes in. 18 | 19 | ## The Solution: A Smart Scheduler for Experiments 20 | 21 | **Design Generation** in `DoEgen` is like using a **smart scheduler** to plan your experiments. 
Instead of running every single combination, it intelligently selects a much smaller, representative set of runs. 22 | 23 | Think about test-driving cars. You want to evaluate different features (engine type, transmission, color, trim level). You *could* test drive every single possible configuration, but that would take forever! A smart scheduler would help you pick the minimum number of diverse test drives needed to get a good feel for all the important features and how they might interact, without driving hundreds of cars. 24 | 25 | `DoEgen` aims to create designs (experimental plans) that are: 26 | 1. **Efficient:** Uses the minimum number of runs possible to get meaningful results. 27 | 2. **Balanced:** Tests each level of each factor roughly the same number of times. (Like making sure you test both 'White' and 'Whole Wheat' flour fairly). 28 | 3. **Near-Orthogonal:** Tries to ensure factors can be evaluated independently. (Ideally, changing the 'Temperature' shouldn't automatically force a change in 'Baking Time' in your plan). 29 | 30 | To achieve this, `DoEgen` cleverly uses the `OApackage` library, which specializes in finding these kinds of optimized experimental plans, often based on mathematical structures called Orthogonal Arrays. 31 | 32 | ## How to Generate a Design with `DoEgen` 33 | 34 | Generating a design involves two main things: 35 | 1. Your **Experiment Setup file** (the Excel file we created in Chapter 1). 36 | 2. A **Settings file** (usually `settings_design.yaml`) that tells `DoEgen` things like: 37 | * Where to find your setup file. 38 | * How many experimental runs you're willing to do (e.g., a minimum and maximum number). 39 | * Where to save the generated designs. 40 | * How much computer time to spend searching for the best design. (We'll cover settings files in detail in [Chapter 8: Configuration Handling 41 | ](08_configuration_handling_.md)). 
42 | 43 | Once you have these ready, you run `DoEgen` from your terminal: 44 | 45 | ```bash 46 | # Make sure your setup file (e.g., Experiment_setup.xlsx) is ready 47 | # Make sure your settings file (e.g., settings_design.yaml) is configured 48 | 49 | # Run the design generation module 50 | python -m doegen.doegen settings_design.yaml 51 | ``` 52 | 53 | **What does this command do?** 54 | * It tells Python to run the `doegen` module within the `doegen` package. 55 | * It passes the `settings_design.yaml` file as input, which tells `DoEgen` all the specifics for this run. 56 | 57 | **What happens next?** 58 | `DoEgen` will: 59 | 1. Read your setup file (`Experiment_setup.xlsx`). 60 | 2. Read your settings file (`settings_design.yaml`). 61 | 3. Figure out the range of run numbers to explore (e.g., from 12 runs up to 150 runs, in steps of 6, based on your settings). 62 | 4. For *each* run number in that range, it will use `OApackage` to search for an optimized design (a sequence of experiments). This can take some time, especially for larger designs. 63 | 5. It saves each generated design as a simple table (CSV file). 64 | 6. It also calculates some quality scores (efficiencies) for each design and saves those too. (More on this in [Chapter 3: Design Evaluation & Efficiency Metrics 65 | ](03_design_evaluation___efficiency_metrics_.md)). 66 | 67 | **Example Output (Simplified Design Array):** 68 | 69 | After running, `DoEgen` might create files like `EDarray_[3, 3, 2, 3]_Nrun54.csv` in an output folder. This file contains the actual plan. It looks like a table where each row is one experiment run, and each column is a factor: 70 | 71 | ```csv 72 | # Simplified Example: EDarray_..._Nrun12.csv (A design with 12 runs) 73 | # Note: Actual arrays use numbers (0, 1, 2...) to represent levels internally. 74 | # DoEgen later converts these back to your actual values (e.g., 170C, 180C). 
75 | 76 | # Factor Levels: Temp(3), Sugar(3), Flour(2), Time(3) 77 | # Levels represented as 0, 1, 2... 78 | 79 | 0,0,0,0 # Run 1: Temp=Level 0, Sugar=Level 0, Flour=Level 0, Time=Level 0 80 | 1,1,0,1 # Run 2: Temp=Level 1, Sugar=Level 1, Flour=Level 0, Time=Level 1 81 | 2,2,0,2 # Run 3: Temp=Level 2, Sugar=Level 2, Flour=Level 0, Time=Level 2 82 | 0,1,1,2 # Run 4: Temp=Level 0, Sugar=Level 1, Flour=Level 1, Time=Level 2 83 | 1,2,1,0 # Run 5: Temp=Level 1, Sugar=Level 2, Flour=Level 1, Time=Level 0 84 | 2,0,1,1 # Run 6: Temp=Level 2, Sugar=Level 0, Flour=Level 1, Time=Level 1 85 | 0,2,0,1 # Run 7: ... and so on ... 86 | 1,0,0,2 87 | 2,1,0,0 88 | 0,0,1,0 89 | 1,1,1,1 90 | 2,2,1,2 91 | ``` 92 | 93 | This table is the core output – your optimized schedule of experiments! 94 | 95 | ## What's Happening Under the Hood? 96 | 97 | Let's peek inside `DoEgen` to see the main steps when you run the design generation command: 98 | 99 | 1. **Read Inputs:** `DoEgen` first reads your experiment recipe from the Excel setup file ([Chapter 1: Experiment Setup Definition 100 | ](01_experiment_setup_definition_.md)) and the run parameters from the `settings_design.yaml` file ([Chapter 8: Configuration Handling 101 | ](08_configuration_handling_.md)). 102 | 2. **Determine Run Sizes:** It calculates the range of experiment sizes (number of runs) to investigate, based on your settings (e.g., minimum runs, maximum runs, step size). 103 | 3. **Optimize for Each Size:** For each number of runs (e.g., 12 runs, 18 runs, 24 runs...), it calls the core optimization function. 104 | * This function (`optimize_design`) uses the `OApackage` library. `OApackage` is the "engine" that searches for balanced, near-orthogonal designs with the specified number of runs and factor levels. It tries many possibilities and selects the best ones it finds within the allowed time. 105 | 4. 
**Evaluate Design Quality:** After `OApackage` proposes a design, `DoEgen` calculates various quality metrics (like balance, orthogonality, D-efficiency) using its `evaluate_design2` function. We'll learn about these metrics in [Chapter 3: Design Evaluation & Efficiency Metrics 106 | ](03_design_evaluation___efficiency_metrics_.md). 107 | 5. **Save Results:** The best design found for that specific run size (e.g., the best 12-run design) and its associated quality scores are saved to files. 108 | 6. **Repeat:** Steps 3-5 are repeated for all the different run sizes you asked `DoEgen` to explore. 109 | 7. **Select & Summarize:** Finally, `DoEgen` analyzes the quality scores across all generated designs and suggests a few "good" options (minimum, optimal, best) based on predefined criteria. This is covered in [Chapter 4: Design Selection 110 | ](04_design_selection_.md). 111 | 112 | Here's a simplified view of the process: 113 | 114 | ```mermaid 115 | sequenceDiagram 116 | participant U as User 117 | participant DG as doegen.py (Main Script) 118 | participant OD as optimize_design() Function 119 | participant OAP as OApackage Library 120 | participant ED as evaluate_design2() Function 121 | participant Files as Output Files 122 | 123 | U->>DG: Runs `python -m doegen.doegen settings.yaml` 124 | DG->>DG: Reads Setup & Settings 125 | DG->>OD: Calls optimize_design() for Run Size N 126 | Note right of OD: Tries to find best N-run design 127 | OD->>OAP: Asks OApackage to generate candidate designs 128 | OAP-->>OD: Returns potential design(s) 129 | OD->>ED: Asks evaluate_design2() to score the design 130 | ED-->>OD: Returns quality metrics (efficiencies) 131 | OD-->>DG: Returns best design found for size N & its scores 132 | DG->>Files: Saves Design Array (CSV) & Efficiencies (CSV) 133 | Note right of DG: Repeats for other run sizes... 
134 | Note right of DG: Finally, suggests best designs (Ch 4) 135 | 136 | ``` 137 | 138 | ## Diving Deeper into the Code (Simplified View) 139 | 140 | The main script `doegen/doegen.py` orchestrates this process. 141 | 142 | 1. **Reading Setup:** It uses functions like `read_setup_new` (which we saw in Chapter 1) to load your experiment definition. 143 | 144 | 2. **Looping and Optimizing:** The `main` function sets up a loop (or uses multiprocessing via `optimize_design_multi`) to iterate through the desired run sizes (e.g., `nrun_min` to `nrun_max` in steps of `ndelta`). Inside this loop, it calls `optimize_design` for each run size. 145 | 146 | ```python 147 | # Simplified view from doegen/doegen.py - main function logic 148 | 149 | def main(fname_setup, outpath, nrun_max, maxtime_per_run, delta_nrun, nrun_min): 150 | # 1. Read the experiment setup 151 | setup = ExperimentalSetup.read(fname_setup) 152 | print(f"Read setup for {setup.number_of_factors} factors.") 153 | 154 | # 2. Determine the range of run sizes to test 155 | # (Calculates nrun_min if not given, determines step size ndelta) 156 | ndelta = delta_nrun # Simplified 157 | # ... calculation of actual nrun_min ... 158 | xrun = np.arange(nrun_min, nrun_max, ndelta) # e.g., [12, 18, 24, ...] 159 | print(f"Will generate designs for run sizes: {xrun}") 160 | 161 | # 3. Optimize for each run size (potentially in parallel) 162 | all_efficiencies = [] 163 | for runsize in xrun: 164 | print(f"--- Optimizing for {runsize} runs ---") 165 | # Call the core optimization function 166 | effs = optimize_design(setup, outpath, maxtime_per_run, ndelta, runsize) 167 | all_efficiencies.append(effs) 168 | # (Actual code might use optimize_design_multi for parallelism) 169 | 170 | # 4. Process results, save summary, suggest designs (See Ch 3 & 4) 171 | # ... save combined efficiencies ... 172 | # ... select minimum, optimal, best designs ... 173 | print("FINISHED Design Generation") 174 | ``` 175 | 176 | 3. 
**Core Optimization (`optimize_design`):** This function is the heart of the generation process. It prepares the inputs for `OApackage` and calls its optimization routine. 177 | 178 | ```python 179 | # Simplified view from doegen/doegen.py - optimize_design function 180 | 181 | import oapackage # The core library for finding designs 182 | 183 | def optimize_design(setup, outpath, runtime, delta, runsize, printopt=True): 184 | """Optimizes design for a specific runsize.""" 185 | print(f"Searching for best design with {runsize} runs...") 186 | outpath_nrun = os.path.join(outpath, f"DesignArray_Nrun{runsize}/") 187 | 188 | # Define the problem for OApackage 189 | arrayclass = oapackage.arraydata_t( 190 | setup.factor_levels, runsize, 0, setup.number_of_factors 191 | ) 192 | 193 | # Ask OApackage to find good designs (this is the complex part!) 194 | # It tries many random starts and improvements. 195 | # 'alpha' weights different optimization criteria. 196 | alpha = [5, 5, 15] # Example weights 197 | # 'niter' relates to how long it searches (calculated based on 'runtime') 198 | niter = calculate_iterations_based_on_time(runtime) # Simplified placeholder 199 | 200 | scores, _, designs, _ = oapackage.Doptim.Doptimize( 201 | arrayclass, nrestarts=10, niter=niter, optimfunc=alpha, maxtime=runtime 202 | ) 203 | print(f"OApackage generated {len(designs)} candidate designs.") 204 | 205 | # Select the best design found by OApackage based on DoEgen's criteria 206 | # (Uses evaluate_design2 to score them - see Ch 3) 207 | best_design_array = find_best_among_candidates(setup, designs) # Simplified 208 | 209 | # Evaluate the final selected design 210 | efficiencies = evaluate_design2(setup, best_design_array, dir_out=outpath_nrun) 211 | 212 | # Save the best design array and its efficiencies 213 | save_design_and_efficiencies(outpath_nrun, setup, runsize, best_design_array, efficiencies) 214 | 215 | return efficiencies # Return the scores for this runsize 216 | ``` 217 | 218 | 
This simplified code shows how `DoEgen` acts as a manager: it sets up the problem, calls the specialized `OApackage` engine to do the heavy lifting of finding candidate designs, evaluates the results using its own criteria ([Chapter 3: Design Evaluation & Efficiency Metrics 219 | ](03_design_evaluation___efficiency_metrics_.md)), and saves the final plan. 220 | 221 | ## Conclusion 222 | 223 | In this chapter, we explored **Design Generation**, the core process where `DoEgen` creates an efficient experimental plan. We learned that instead of testing every single combination (which is often impractical), `DoEgen` acts like a "smart scheduler", using the `OApackage` library to find a smaller set of experiments that are **balanced** and **near-orthogonal**. 224 | 225 | We saw how to initiate this process using the `python -m doegen.doegen` command with a settings file, and what kind of output files (the design arrays) to expect. We also got a glimpse into the internal steps `DoEgen` takes to optimize and evaluate these designs. 226 | 227 | Now that we have generated potential experimental plans, the next crucial step is to understand how "good" these plans actually are. Let's move on to [Chapter 3: Design Evaluation & Efficiency Metrics 228 | ](03_design_evaluation___efficiency_metrics_.md) to learn how `DoEgen` measures the quality of the generated designs. 229 | 230 | --- 231 | 232 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/03_design_evaluation___efficiency_metrics_.md: -------------------------------------------------------------------------------- 1 | # Chapter 3: Design Evaluation & Efficiency Metrics 2 | 3 | In [Chapter 2: Design Generation](02_design_generation_.md), we saw how `DoEgen` acts like a smart scheduler to create potential experimental plans (designs) for different numbers of runs. 
But how do we know if a generated plan is actually *good*? Just because we have a schedule doesn't mean it's efficient or useful. This chapter explains how `DoEgen` grades these plans using **Design Evaluation & Efficiency Metrics**. 4 | 5 | ## Why Grade Our Experiment Plan? 6 | 7 | Imagine you created several potential test drive schedules (from Chapter 2) for evaluating new cars. Some schedules might have you driving only on highways, others only in the city. Some might make you test the red car way more often than the blue car. Clearly, not all schedules are equally helpful! 8 | 9 | We need a way to **grade** these schedules based on specific criteria: 10 | * Does it test all the features (factors) fairly? 11 | * Does it cover different driving conditions (levels) evenly? 12 | * Does it avoid redundant tests or confusing situations where changing one feature always changes another? 13 | 14 | **Design Evaluation** in `DoEgen` does exactly this for our experimental plans. It assigns numerical scores (metrics) to quantify how well-designed a plan is. This helps us compare different plans (e.g., a 12-run plan vs. an 18-run plan) and choose the one that gives us the most reliable information for the number of experiments we can afford. 15 | 16 | ## Key Grading Criteria: The Efficiency Metrics 17 | 18 | `DoEgen` uses several statistical metrics to "grade" a design. Think of these like different subjects on a report card for your experimental plan. The scores usually range from 0 (worst) to 100 (best or theoretically optimal). Here are the main ones: 19 | 20 | 1. **Level Balance:** 21 | * **Question:** How evenly does the plan use each setting (level) for every factor? 22 | * **Analogy:** In our cake baking plan, does it use 170°C, 180°C, and 190°C roughly the same number of times? Does it test 'White' flour about as often as 'Whole Wheat'? 23 | * **Why it matters:** Ensures fair comparison of all levels. A low score means some settings are under-represented. 
24 | * **Score:** 0-100. 100 means perfect balance (each level appears exactly the same number of times, or as close as possible). 25 | 26 | 2. **Orthogonality:** 27 | * **Question:** How independent are the factors in the plan? Can we change one factor's setting without being forced to change another? 28 | * **Analogy:** In the test drive schedule, if every time we test the 'Sport' engine, we *also* have to test the 'Manual' transmission, it's hard to tell if good performance is due to the engine or the transmission. An orthogonal plan avoids this forced pairing. 29 | * **Why it matters:** Allows us to estimate the effect of each factor separately without confusion. Low orthogonality (high correlation between factor columns in the plan) makes analysis difficult. 30 | * **Score:** 0-100. 100 means perfectly orthogonal (all factors are statistically independent in the design). 31 | 32 | 3. **D-Efficiency (D1-Eff, D-Eff, D2-Eff):** 33 | * **Question:** How precisely can we estimate the effects of the factors based on this plan? 34 | * **Analogy:** Think of this as the "sharpness" or "focus" of the picture we'll get about how each factor influences the outcome. A higher D-efficiency means a sharper picture. 35 | * **Why it matters:** A more D-efficient design allows for more accurate conclusions about which factors are important and by how much. 36 | * **Details:** 37 | * `D1-Eff`: Considers only the main effects of each factor (most commonly used for initial screening). 38 | * `D-Eff`: Considers main effects *and* quadratic effects (e.g., effect of Temperature and Temperature-squared). 39 | * `D2-Eff`: Considers main, quadratic, *and* two-way interaction effects (e.g., how Temperature and Sugar Amount interact). 40 | * **Score:** 0-100. 100 is a theoretical maximum. Higher is better, especially for `D1-Eff` in smaller designs. 41 | 42 | 4. 
**Interaction Balance (Two-way Interaction Balance):** 43 | * **Question:** How evenly does the plan test combinations of *pairs* of factor levels? 44 | * **Analogy:** Does our cake plan test 'White' flour combined with 170°C, 180°C, and 190°C? Does it also test 'Whole Wheat' flour with all three temperatures? Does it do this fairly for *all pairs* of factors? 45 | * **Why it matters:** Helps understand if the effect of one factor changes depending on the level of another factor (interactions). 46 | * **Score:** 0-100. 100 means all pairs of levels across all pairs of factors are tested equally often (or as close as possible). 47 | * **Related Metric:** *Two-way Interaction with at least one occurrence*: Checks if *every* possible pair combination appears at least once. Score 100 means yes, lower means some combinations are missing entirely. 48 | 49 | `DoEgen` also calculates other metrics like **Center Balance** (related to how well centered the design is for numeric factors) and **A-Efficiencies** (another measure related to estimation precision), giving a comprehensive evaluation. 50 | 51 | ## How `DoEgen` Performs the Evaluation 52 | 53 | The good news is: you don't usually need to run a separate command for evaluation! When you run the [Design Generation](02_design_generation_.md) process (`python -m doegen.doegen settings_design.yaml`), `DoEgen` automatically evaluates *every* design it generates for each run size. 54 | 55 | Inside `DoEgen`, a function called `evaluate_design2` (located in the `doegen/doegen.py` file) takes the generated design array (the table of 0s, 1s, 2s...) and calculates all these efficiency scores. 56 | 57 | ## Understanding the Evaluation Results 58 | 59 | After running the design generation, `DoEgen` saves the results of the evaluation in a few places within your output directory (specified in `settings_design.yaml`): 60 | 61 | 1. 
**Individual Efficiency Files:** For each run size (e.g., 12 runs), inside its specific subfolder (e.g., `DesignArray_Nrun12/`), you'll find: 62 | * `Efficiencies_[factor_levels]_Nrun12.csv`: A file containing the calculated scores (Level Balance, Orthogonality, D1-Eff, etc.) for the best 12-run design found. 63 | * `Table_Pearson_Correlation.csv`: Shows the pairwise correlation between factors (related to Orthogonality). Low values (near 0) are good. 64 | * `Table_Interaction_Balance.txt`: Details about the balance of pairwise combinations. 65 | * `pairwise_correlation.png`: A plot visualizing the relationships and balance within the design (see image below). 66 | 67 | 2. **Combined Efficiency File:** In the main output directory, you'll find: 68 | * `Efficiencies_[factor_levels]_all3.csv`: A summary table listing the key efficiency scores for *all* the run sizes tested (e.g., 12, 18, 24...). This is very useful for comparing designs. 69 | 70 | *Example Snippet (`Efficiencies_..._all3.csv`):* 71 | ```csv 72 | Center Balance,Level Balance,Orthogonality,Two-level Balance,Two-level Min-Eff,D-Eff,D1-Eff,D2-Eff,A-Eff,A1-Eff,A2-Eff,Nexp 73 | 96.5,97.2,91.3,88.5,95.0,15.1,75.3,8.2,10.5,60.1,5.1,12 74 | 98.1,98.5,94.6,92.1,100.0,18.9,85.7,10.1,12.8,72.5,6.8,18 75 | 99.2,99.0,97.8,96.3,100.0,25.6,92.1,14.5,18.3,81.0,9.9,24 76 | ... (more rows for other run sizes) ... 77 | ``` 78 | *This table lets you see how scores improve (or plateau) as you increase the number of experiments (`Nexp`).* 79 | 80 | 3. **Efficiency Plot:** Also in the main output directory: 81 | * `Efficiencies_[factor_levels].png`: A plot showing how key efficiencies change with the number of runs. 82 | 83 | ![Example overview plot of the main efficiencies (from 0=worst to 100=best) as function of number of experiments.](https://github.com/sebhaan/DoEgen/blob/main/figures/Efficiencies.png){width=400} 84 | 85 | * **How to read the plot:** The X-axis is the number of experiments (runs). 
The Y-axis is the efficiency score (0-100). Each colored line represents a different metric. You generally want designs where the lines for key metrics (like Level Balance, Orthogonality, Two-level Min-Eff, D1-Eff) are high (close to 100). This plot helps visualize the trade-off: often, scores improve rapidly at first and then level off. You might choose a run size where the scores are acceptably high but before the lines become flat (diminishing returns). 86 | 87 | 4. **Pairwise Correlation Plot:** For each individual design (e.g., in `DesignArray_Nrun12/`): 88 | * `pairwise_correlation.png`: Shows scatter plots for each pair of factors in the design. 89 | 90 | ![Pairwise factor correlation plot of an example 8 factor design array with a mix of 3- and 2-level factors. The lines and blue shadows correspond to the linear regression fit and its uncertainty. Two pairs are 100% orthogonal if the linear regression line is horizontal. The diagonal bar charts show the histogram of level values for each factor (perfect level balance if histogram is flat).](https://github.com/sebhaan/DoEgen/blob/main/figures/pairwise_correlation.png){width=600} 91 | 92 | * **How to read the plot:** 93 | * **Diagonal:** Histograms showing how often each level was used for that factor. Flat histograms indicate good Level Balance. 94 | * **Off-Diagonal:** Scatter plots showing the combinations tested for pairs of factors. If the points fill the space somewhat evenly and the regression line (blue line) is mostly flat (horizontal), it indicates good Orthogonality between those two factors. Steep lines indicate correlation (bad for orthogonality). 95 | 96 | ## What's Happening Inside `evaluate_design2`? 
97 | 98 | Let's peek under the hood to see the basic steps the `evaluate_design2` function takes when it receives a design array: 99 | 100 | ```mermaid 101 | sequenceDiagram 102 | participant OD as optimize_design() (from Ch 2) 103 | participant ED2 as evaluate_design2() Function 104 | participant Array as Design Array (Input) 105 | participant Calcs as Internal Calculations (Numpy, Pandas) 106 | participant Files as Output Files (.csv, .png) 107 | 108 | OD->>ED2: Calls evaluate_design2() with a generated Array 109 | ED2->>Array: Receives the numerical design array (0s, 1s, ...) 110 | ED2->>Calcs: Calculates Level Balance (counts levels per column) 111 | ED2->>Calcs: Normalizes Array (scales values, e.g., to -1, 1) 112 | ED2->>Calcs: Calculates Orthogonality (via Correlation matrix of normalized array) 113 | ED2->>Calcs: Creates Model Matrix X (using `create_model`) 114 | ED2->>Calcs: Calculates D-Efficiencies (Determinant of X^T*X) 115 | ED2->>Calcs: Calculates A-Efficiencies (Trace of inverse of X^T*X) 116 | ED2->>Calcs: Calculates Interaction Balance (counts pairs of levels across pairs of columns) 117 | Calcs-->>ED2: Returns calculated scores 118 | ED2->>Files: Saves detailed tables (Correlation, Interaction Balance) 119 | ED2->>Files: Saves Pairwise Correlation Plot (using Matplotlib/Seaborn) 120 | ED2-->>OD: Returns main efficiency scores (e.g., a list or tuple) 121 | ``` 122 | 123 | **Simplified Code View (`doegen/doegen.py`):** 124 | 125 | The `evaluate_design2` function uses libraries like `numpy` and `pandas` to perform these calculations. Here are highly simplified examples of the logic: 126 | 127 | ```python 128 | # Simplified view inside doegen/doegen.py - evaluate_design2 function 129 | 130 | import numpy as np 131 | import pandas as pd 132 | # ... other imports like itertools, matplotlib, seaborn ... 
133 | 134 | def normalize_array(Array): 135 | """Scales array columns, e.g., from 0,1,2 to -1, 0, 1.""" 136 | # Simplified: Actual code handles different ranges properly 137 | colmax = np.max(Array, axis=0) 138 | colmin = np.min(Array, axis=0) 139 | # Avoid division by zero if a factor has only one level tested 140 | coldelta = np.where(colmax > colmin, colmax - colmin, 1) 141 | colmean = (colmax + colmin) / 2.0 142 | return 2 * (Array - colmean) / coldelta 143 | 144 | def calc_Deff(X): 145 | """Calculates D-efficiency from model matrix X.""" 146 | # D-eff relates to the determinant of the 'information matrix' (X^T * X) 147 | XX = np.dot(X.T, X) 148 | try: 149 | # Use slogdet for numerical stability 150 | _sign, logdet = np.linalg.slogdet(XX) 151 | # Geometric mean of eigenvalues, scaled 152 | det = np.exp(logdet / X.shape[1]) if _sign > 0 else 0 153 | except np.linalg.LinAlgError: 154 | det = 0 # Matrix might be singular (bad design) 155 | return 100 * det / X.shape[0] # Scaled score 0-100 156 | 157 | # def create_model(Array, mode=1): ... # Creates model matrix (see Ch 2) 158 | 159 | def evaluate_design2(setup, Array, printopt=False, dir_out=None, plotgrid=True): 160 | """Calculates various efficiency metrics for a design Array.""" 161 | runsize, number_of_factors = Array.shape 162 | fac_levels = setup.factor_levels # e.g., [3, 3, 2, 3] 163 | 164 | # --- Level Balance Calculation (Simplified) --- 165 | sum_imbalance = 0.0 166 | for col_idx, nlevel in enumerate(fac_levels): 167 | column = Array[:, col_idx].astype(int) 168 | ideal_count_per_level = runsize / nlevel 169 | counts = np.bincount(column, minlength=nlevel) # Count occurrences of 0, 1, ... 
170 | imbalance = np.sum(np.abs(counts - ideal_count_per_level)) 171 | sum_imbalance += imbalance 172 | # Scale imbalance relative to total size, convert to % efficiency 173 | leveleff = 100 * (1 - sum_imbalance / (2 * (runsize - runsize / np.mean(fac_levels)))) # Formula nuance 174 | 175 | # --- Orthogonality (via Pearson Correlation) --- 176 | Anorm = normalize_array(Array) # Scale to -1 to 1 for fair correlation 177 | Acor_pearson = np.corrcoef(Anorm.T) # Calculate correlation matrix 178 | # Orthogonality relates to how close correlations are to zero 179 | # A simple (though not exact) proxy could involve off-diagonal sums 180 | ortho_measure = np.sum(np.abs(np.triu(Acor_pearson, k=1))) # Sum absolute off-diagonal correlations 181 | # Convert to 0-100 scale (lower correlation sum is better) 182 | orthoeff = 100 * (1 - ortho_measure / (number_of_factors * (number_of_factors - 1) / 2)) # Simplified scaling 183 | 184 | # --- D-Efficiency (Example for Main Effects D1-Eff) --- 185 | X1, _ = create_model(Anorm, mode=1, norm=False) # Model with main effects 186 | D1eff = calc_Deff(X1) 187 | 188 | # --- Interaction Balance (Conceptual) --- 189 | # Uses calc_twofactorbalance() internally 190 | # Involves iterating through pairs of columns (factors) 191 | # For each pair, count occurrences of level combinations (e.g., how often Factor A=0 and Factor B=1 occurs) 192 | # Compare counts to the ideal count (runsize / (levels_A * levels_B)) 193 | # Sum up deviations to get imbalance score, convert to % efficiency 194 | twoleveleff, twolevelmin, _ = calc_twofactorbalance(setup, Array) 195 | twoleveleff, twolevelmin = 100 * twoleveleff, 100 * twolevelmin 196 | 197 | # ... calculate other metrics (Center Balance, A-Eff, D-Eff, D2-Eff) ... 198 | 199 | # --- Save outputs if dir_out is provided --- 200 | if dir_out is not None: 201 | # Save correlation tables, interaction balance tables, plots... 
202 | # (Code uses pandas DataFrames .to_csv() and matplotlib/seaborn .savefig()) 203 | pass 204 | 205 | # --- Return the main scores --- 206 | efficiencies = ( 207 | # centereff, # Calculated earlier 208 | leveleff, 209 | orthoeff, 210 | twoleveleff, 211 | twolevelmin, 212 | # Deff, D1eff, D2eff, # Calculated 213 | # Aeff, A1eff, A2eff, # Calculated 214 | # ... other scores ... 215 | ) # Actual function returns a specific tuple of ~11 scores 216 | # Simplified return for clarity: 217 | return (leveleff, orthoeff, D1eff, twoleveleff, twolevelmin) # Example subset 218 | ``` 219 | 220 | This code takes the design array, performs calculations (like counting, normalizing, computing correlations, building models, finding determinants), saves detailed diagnostics, and finally returns the key efficiency scores. 221 | 222 | ## Conclusion 223 | 224 | In this chapter, we learned about **Design Evaluation & Efficiency Metrics**. We saw that `DoEgen` doesn't just generate experiment plans; it also grades them using metrics like **Level Balance**, **Orthogonality**, **D-Efficiency**, and **Interaction Balance**. These scores, typically ranging from 0 to 100, tell us how "good" a design is in terms of fairness, independence, precision, and coverage of combinations. 225 | 226 | We learned that this evaluation happens automatically during the [Design Generation](02_design_generation_.md) step, producing `.csv` files and plots that summarize the efficiencies for different run sizes. Understanding these metrics and plots is crucial for making an informed decision about which experimental plan to actually use. 227 | 228 | Now that we know how to generate designs (Chapter 2) and how to evaluate their quality (Chapter 3), the next logical step is to use these evaluations to pick the best design for our specific needs and budget. Let's move on to [Chapter 4: Design Selection](04_design_selection_.md) to see how `DoEgen` helps us with this final step in planning our experiment. 
229 | 230 | --- 231 | 232 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/04_design_selection_.md: -------------------------------------------------------------------------------- 1 | # Chapter 4: Design Selection 2 | 3 | In the previous chapter, [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md), we learned how `DoEgen` grades the experimental plans (designs) it creates using scores like Level Balance, Orthogonality, and D-Efficiency. We generated and evaluated designs for various numbers of experimental runs (e.g., 12 runs, 18 runs, 24 runs, etc.). 4 | 5 | But now we have a bunch of potential plans, each with its own report card (efficiency scores). How do we pick the *right* one to actually use for our experiment? This is where **Design Selection** comes in! 6 | 7 | ## The Challenge: Choosing the Best Plan for You 8 | 9 | Imagine you're planning a trip. A travel agent (like `DoEgen`'s design generation) might show you several itineraries: 10 | * A quick, bare-bones trip (fewest days, covers just the essentials). 11 | * A well-rounded trip (moderate length, good mix of sights and relaxation). 12 | * A long, comprehensive trip (many days, sees absolutely everything). 13 | 14 | You wouldn't just pick one randomly! You'd consider your budget, how much time you have, and what's most important to you. 15 | 16 | Similarly, after `DoEgen` generates and evaluates designs with different numbers of runs (12, 18, 24...), we need to choose the one that best fits our experimental "budget" (how many runs we can afford) and our "goals" (how much detail and reliability we need). Doing this manually by comparing all the efficiency scores for all the run sizes can be tedious. 
17 | 18 | ## The Solution: Automatic Recommendations 19 | 20 | `DoEgen` makes this easier by automatically suggesting a few good candidate designs based on the efficiency scores it calculated in Chapter 3. Think of it like the **travel agent highlighting three recommended options**: 21 | 22 | 1. **Minimum Design:** This is like the **cheapest valid plan**. It's the design with the *fewest* experimental runs that still meets some basic quality standards (e.g., good enough balance and orthogonality, covers essential combinations). It's suitable if you're on a very tight budget or just doing an initial screening. 23 | 24 | 2. **Optimal Design:** This is like the **best value-for-money plan**. It aims to find a sweet spot, balancing high quality (good efficiency scores) with a reasonable number of runs. It often requires more runs than the minimum, but the improvement in quality is usually worth the extra effort. 25 | 26 | 3. **Best Design:** This is like the **most comprehensive plan**. It's the design that achieves the highest overall quality score among all the generated options, even if it means doing quite a few more experiments. This is for situations where getting the absolute highest quality data is the top priority, and the number of runs is less of a constraint. 27 | 28 | This automatic selection process helps you quickly narrow down the choices to a few sensible options. 29 | 30 | ## How `DoEgen` Selects the Designs 31 | 32 | Good news! Design selection happens **automatically** at the very end of the [Design Generation](02_design_generation_.md) process. 
When you run: 33 | 34 | ```bash 35 | python -m doegen.doegen settings_design.yaml 36 | ``` 37 | 38 | After generating and evaluating designs for all requested run sizes (e.g., 12, 18, 24...), `DoEgen` performs one final step: it analyzes the collected efficiency data (specifically, the information stored in the `Efficiencies_[factor_levels]_all3.csv` file we saw in Chapter 3) and applies a set of rules to pick the Minimum, Optimal, and Best designs. 39 | 40 | ## The Selection Rules (Simplified View) 41 | 42 | `DoEgen` uses specific thresholds based on the efficiency metrics from Chapter 3. Here's a simplified idea of the rules (the exact percentages can be found in the `DoEgen` documentation, like `MANUAL.md` or `README.md`): 43 | 44 | 1. **Minimum Design Criteria:** 45 | * **Goal:** Find the *smallest* design that's basically sound. 46 | * **Rules:** 47 | * Must have enough runs (usually, number of runs `Nexp >= number of factors + 1`). 48 | * Must have good **Level Balance** (e.g., > 95%). 49 | * Must have good **Orthogonality** (e.g., > 90%). 50 | * Must test every pair of factor levels at least once (**Two-level Min-Eff** = 100%). 51 | * `DoEgen` looks through the evaluated designs, starting from the smallest run size, and picks the *first one* that meets all these conditions. 52 | 53 | 2. **Optimal Design Criteria:** 54 | * **Goal:** Find the best balance between quality and run count among designs meeting stricter criteria. 55 | * **Rules:** 56 | * Must meet even *higher* thresholds for **Level Balance** (e.g., > 98%) and **Orthogonality** (e.g., > 95%). 57 | * Must also meet the **Two-level Min-Eff** = 100% requirement. 58 | * Among the designs meeting these stricter rules, `DoEgen` often calculates a score that rewards high efficiency but adds a small penalty for increasing the number of runs significantly beyond the 'Minimum' design. It picks the design that maximizes this "value" score. 59 | 60 | 3. 
**Best Design Criteria:** 61 | * **Goal:** Find the design with the absolute highest overall quality score. 62 | * **Rules:** 63 | * `DoEgen` calculates an overall score for *all* generated designs. This score typically sums up key efficiencies (like Level Balance, Orthogonality, D1-Efficiency) and might include a small penalty based on the run size (to slightly favor smaller designs if scores are almost identical). 64 | * It simply picks the design with the highest calculated score, regardless of whether it's much larger than the Minimum or Optimal. 65 | 66 | ## Where to Find the Recommendations 67 | 68 | Once the `doegen.doegen` script finishes, you'll find the selection results in your specified output directory: 69 | 70 | 1. **Summary Text File:** 71 | * `Experiment_Design_selection_summary.txt` 72 | * This file gives you a clear, easy-to-read summary listing the chosen Minimum, Optimal, and Best designs, their run sizes (`Nexp`), and their key efficiency scores. This is usually the first place to look! 73 | 74 | *Example Snippet (`Experiment_Design_selection_summary.txt`):* 75 | ```text 76 | RESULTS OVERVIEW: 77 | -------------------------------- 78 | Minimum Exp Design Runsize: 30 79 | Optimal Exp Design Runsize: 72 80 | Best Exp Design Runsize: 90 81 | -------------------------------- 82 | 83 | 84 | Efficiencies: 85 | ------------------------------------------------------------------------------ 86 | Minimum Design Optimal Design Best Design 87 | Center Balance 96.800 99.500 99.600 88 | Level Balance 97.500 99.100 99.200 89 | Orthogonality 92.300 98.200 98.500 90 | Two-Way Interact Bal 91.800 97.100 97.800 91 | D Efficieny 19.500 35.800 38.200 92 | D1 Efficieny 88.100 96.400 97.100 93 | ``` 94 | 95 | 2. 
**Ready-to-Use Design Tables:** 96 | * `Designtable_minimum_NrunXX.csv` (e.g., `Designtable_minimum_Nrun30.csv`) 97 | * `Designtable_optimal_NrunYY.csv` (e.g., `Designtable_optimal_Nrun72.csv`) 98 | * `Designtable_best_NrunZZ.csv` (e.g., `Designtable_best_Nrun90.csv`) 99 | * These `.csv` files contain the actual experimental plans for the selected designs. Unlike the raw `EDarray...csv` files (which use numbers like 0, 1, 2), these tables show the real factor names and the actual level values (e.g., '180C', 'Catalyst X') you defined in your setup. They are ready for you to use to run your experiments! 100 | 101 | *Example Snippet (`Designtable_optimal_Nrun72.csv`):* 102 | ```csv 103 | Nexp,Temperature,Pressure,Catalyst,Speed 104 | 1,20,1,Catalyst X,100 105 | 2,30,5,Catalyst X,200 106 | 3,40,1,Catalyst X,300 107 | 4,20,5,Catalyst Y,300 108 | 5,30,1,Catalyst Y,100 109 | ... (72 rows total) ... 110 | ``` 111 | 112 | 3. **Efficiency Plot:** 113 | * `Efficiencies_[factor_levels].png` (Generated in Chapter 3, but useful here) 114 | * Looking back at this plot helps you visually understand the trade-offs. You can see where adding more runs gives diminishing returns (the curves flatten out), which might help you decide if the 'Optimal' or 'Best' design is worth the extra runs compared to the 'Minimum'. 115 | 116 | ## What Happens Under the Hood? 117 | 118 | The selection logic happens within the `main` function of `doegen/doegen.py` *after* all designs have been generated and their efficiencies calculated and stored in a combined array (let's call it `effs_array`). 
119 | 120 | **Process Flow:** 121 | 122 | ```mermaid 123 | sequenceDiagram 124 | participant DG as DoEgen Main Script (doegen.py) 125 | participant OptLoop as Optimization Loop (Ch 2 & 3) 126 | participant EffArray as Combined Efficiency Data (effs_array) 127 | participant Rules as Selection Logic 128 | participant Output as Output Files (.txt, .csv) 129 | 130 | DG->>OptLoop: Generate & Evaluate designs for N=12, 18, 24... 131 | OptLoop-->>EffArray: Store all efficiency scores 132 | Note right of EffArray: Contains scores for all run sizes 133 | 134 | DG->>EffArray: Access the combined efficiency data 135 | DG->>Rules: Apply 'Minimum' criteria thresholds 136 | Rules-->>DG: Identify runsize for Minimum design (e.g., 30) 137 | DG->>Rules: Apply 'Optimal' criteria thresholds & scoring 138 | Rules-->>DG: Identify runsize for Optimal design (e.g., 72) 139 | DG->>Rules: Apply 'Best' scoring logic 140 | Rules-->>DG: Identify runsize for Best design (e.g., 90) 141 | 142 | DG->>Output: Write Experiment_Design_selection_summary.txt 143 | DG->>Output: Call array2valuetable() to create Designtable_minimum_Nrun30.csv 144 | DG->>Output: Call array2valuetable() to create Designtable_optimal_Nrun72.csv 145 | DG->>Output: Call array2valuetable() to create Designtable_best_Nrun90.csv 146 | 147 | ``` 148 | 149 | **Simplified Code View (`doegen/doegen.py` - near the end of `main` function):** 150 | 151 | ```python 152 | # Simplified view from doegen/doegen.py - main function logic (after optimization loop) 153 | 154 | # Assume effs_array is a NumPy array where rows are run sizes and columns are efficiencies 155 | # Assume xrun is a NumPy array with the corresponding run sizes (e.g., [12, 18, 24, ...]) 156 | 157 | def main(...): 158 | # ... (Code from Chapter 2 & 3: read setup, loop through run sizes, optimize, evaluate) ... 159 | # multi_effs = optimize_design_multi(...) # Collects efficiencies 160 | # effs_array = ... # Convert multi_effs into the array 161 | 162 | # ... 
(Save combined efficiencies plot and CSV as shown in Chapter 3) ... 163 | 164 | ###### Identify minimum, optimal, and best runsize ###### 165 | print("Finding minimum, optimal and best designs...") 166 | Result = namedtuple("Result", ["name", "runsize", "effs"]) 167 | results = {} # Dictionary to store the selected designs 168 | 169 | # --- Find Minimum Design --- 170 | # Apply thresholds using np.where: find indices where conditions are met 171 | min_thresholds_met = np.where( 172 | (effs_array[:, 0] >= 95) # Col 0: Center Balance 173 | & (effs_array[:, 1] >= 95) # Col 1: Level Balance 174 | & (effs_array[:, 2] >= 90) # Col 2: Orthogonality 175 | & (effs_array[:, 4] == 100) # Col 4: Two-level Min-Eff 176 | # & (xrun >= setup.number_of_factors + 1) # Implicitly handled by nrun_min usually 177 | )[0] # Get the indices that satisfy the conditions 178 | 179 | if len(min_thresholds_met) > 0: 180 | idx_min = min_thresholds_met[0] # Pick the first index (lowest run size) 181 | results["min"] = Result("minimum", xrun[idx_min], effs_array[idx_min]) 182 | print(f" Minimum design found: {results['min'].runsize} runs") 183 | else: 184 | print(" Warning: Could not find a design meeting minimum criteria.") 185 | 186 | # --- Find Optimal Design --- 187 | # Apply stricter thresholds 188 | opt_thresholds_met = np.where( 189 | (effs_array[:, 0] >= 98) 190 | & (effs_array[:, 1] >= 98) 191 | & (effs_array[:, 2] >= 95) 192 | # & (effs_array[:, 3] >= 95) # Col 3: Two-level Balance 193 | & (effs_array[:, 4] == 100) 194 | )[0] 195 | 196 | if len(opt_thresholds_met) > 0 and "min" in results: 197 | # Calculate a score for designs meeting optimal criteria 198 | # Score rewards efficiency, penalizes extra runs vs minimum 199 | runs_sel = xrun[opt_thresholds_met] 200 | score = ( 201 | effs_array[opt_thresholds_met, 0] # Center Bal 202 | + effs_array[opt_thresholds_met, 2] # Ortho 203 | + effs_array[opt_thresholds_met, 3] # 2-Way Bal 204 | + 0.5 * effs_array[opt_thresholds_met, 6] # D1-Eff 
(Col 6) 205 | - (4.0 / results["min"].runsize) * runs_sel # Penalty for run size 206 | ) 207 | idx_opt_relative = np.argmax(score) # Find index with max score *within the selection* 208 | idx_opt_absolute = opt_thresholds_met[idx_opt_relative] # Get original index 209 | results["opt"] = Result("optimal", xrun[idx_opt_absolute], effs_array[idx_opt_absolute]) 210 | print(f" Optimal design found: {results['opt'].runsize} runs") 211 | else: 212 | print(" Warning: Could not find a design meeting optimal criteria.") 213 | 214 | # --- Find Best Design --- 215 | # Calculate score based on overall quality, slight penalty for size 216 | score_best = ( 217 | effs_array[:, 0] # Center Bal 218 | + effs_array[:, 2] # Ortho 219 | + effs_array[:, 3] # 2-Way Bal 220 | + (100 * (effs_array[:, 4] - 100)) # Heavy penalty if MinEff != 100 221 | + 0.5 * effs_array[:, 6] # D1-Eff 222 | - (1.0 / nrun_max) * xrun # Small penalty for run size 223 | ) 224 | idx_best = np.argmax(score_best) # Find index with highest score overall 225 | results["best"] = Result("best", xrun[idx_best], effs_array[idx_best]) 226 | print(f" Best design found: {results['best'].runsize} runs") 227 | 228 | 229 | # --- Generate Output Files --- 230 | print("Saving minimum, optimal, and best design as experiment design tables...") 231 | # (Code to write the summary text file) 232 | print_designselection_summary(results, fname_out=os.path.join(outpath, "Experiment_Design_selection_summary.txt")) 233 | 234 | # Loop through selected results and create the final tables 235 | for result in results.values(): 236 | # Construct paths to the raw array file and the output table file 237 | fname_array = os.path.join(outpath, f"DesignArray_Nrun{result.runsize}", f"EDarray_{setup.factor_levels}_Nrun{result.runsize}.csv") 238 | fname_out = os.path.join(outpath, f"Designtable_{result.name}_Nrun{result.runsize}.csv") 239 | # Call the function to convert the raw array to a user-friendly table 240 | array2valuetable(setup, fname_array, 
fname_out) 241 | # (Optional: Code to append non-varied factors if any) 242 | 243 | print("\nFINISHED Design Selection") 244 | ``` 245 | 246 | This code snippet shows how `DoEgen` uses NumPy's array filtering (`np.where`) and calculations (`np.argmax`) to apply the selection rules and identify the indices corresponding to the Minimum, Optimal, and Best designs within the `effs_array`. Finally, it calls `array2valuetable` to create the human-readable `.csv` files for these selected designs. 247 | 248 | ## Remember: They are Suggestions! 249 | 250 | While the Minimum, Optimal, and Best suggestions are very helpful starting points, they are based on general rules. You are the expert on your experiment! 251 | 252 | * **Check the Summary and Plot:** Always look at the `Experiment_Design_selection_summary.txt` and the `Efficiencies...png` plot. 253 | * **Consider Your Constraints:** Maybe the 'Optimal' design suggests 72 runs, but your budget strictly limits you to 50. In that case, you might look at the efficiency plot and the `Efficiencies_..._all3.csv` file to find the best design available at or below 50 runs (perhaps the 48-run design). 254 | * **Manual Selection:** If you decide to use a design different from the suggested ones (e.g., you want the 48-run design), you can easily create its user-friendly table yourself. Find the corresponding raw array file (`EDarray_..._Nrun48.csv`) in its subfolder (`DesignArray_Nrun48/`) and use the `array2valuetable` function (or simply adapt the code snippet above) to convert it. 255 | 256 | ## Conclusion 257 | 258 | In this chapter, we learned about **Design Selection**, the helpful feature in `DoEgen` that automatically recommends candidate experimental plans after generation and evaluation. By suggesting a **Minimum** (cheapest valid), **Optimal** (best value), and **Best** (highest quality) design, it simplifies the process of choosing a final plan from the many options generated. 
259 | 260 | We saw how this selection is based on predefined efficiency criteria applied to the results from [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md), and where to find the recommendations (`Experiment_Design_selection_summary.txt`) and the ready-to-use plans (`Designtable_....csv`). 261 | 262 | With a well-evaluated and selected experimental plan in hand, we are finally ready to perform the actual experiments! The next step is to gather the results from these experiments and bring them back into `DoEgen`. 263 | 264 | Let's move on to [Chapter 5: Experiment Result Input & Merging](05_experiment_result_input___merging_.md) to see how we manage the data coming back from our experimental runs. 265 | 266 | --- 267 | 268 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/05_experiment_result_input___merging_.md: -------------------------------------------------------------------------------- 1 | # Chapter 5: Experiment Result Input & Merging 2 | 3 | In [Chapter 4: Design Selection](04_design_selection_.md), we saw how `DoEgen` helps us choose the best experimental plan (like the `Designtable_optimal_Nrun72.csv` file) based on efficiency metrics. Now comes the exciting part: you've actually *run* those experiments! Maybe you baked the cakes, grew the plants, or ran the simulations according to the plan. 4 | 5 | Now you have the results – how tasty was each cake? How tall did each plant grow? What was the output of each simulation? This chapter is all about taking those real-world results and getting them ready to be analyzed by `DoEgen`. 6 | 7 | ## The Goal: Matching Results to the Plan 8 | 9 | Imagine you followed the test drive schedule from Chapter 4. 
For each drive (each row in your `Designtable_...csv`), you recorded the actual fuel efficiency (Miles Per Gallon or Liters per 100km). 10 | 11 | Now you have two pieces of information: 12 | 1. **The Plan:** Which car settings (engine, transmission, etc.) were used for each specific test drive (e.g., Drive #1, Drive #2...). This is in your `Designtable_...csv` file. 13 | 2. **The Results:** The actual fuel efficiency you measured for each test drive (e.g., Drive #1 got 25 MPG, Drive #2 got 30 MPG...). This might be scribbled in a notebook or typed somewhere else. 14 | 15 | The goal of **Experiment Result Input & Merging** is to **combine these two pieces of information systematically**. We need to match the result (e.g., 25 MPG) back to the exact conditions that produced it (Drive #1: specific engine, transmission, etc.). 16 | 17 | `DoEgen` needs this combined information to figure out how the different factors (engine, transmission) influenced the outcome (fuel efficiency). 18 | 19 | ## The Tool: The Experiment Results Excel Template 20 | 21 | Just like we used an Excel template to define our experiment setup in [Chapter 1: Experiment Setup Definition 22 | ](01_experiment_setup_definition_.md), `DoEgen` uses another Excel template to collect your experimental results. This ensures the results are in a structured format that `DoEgen` can easily understand and merge with the original design. 23 | 24 | You can create a blank template file using a helper script included with `DoEgen`: 25 | 26 | ```bash 27 | # Run this command in your terminal in the DoEgen project directory 28 | python -m doegen.create_resultfile 29 | ``` 30 | 31 | This command creates an Excel file named `Experiment_results_template.xlsx`. 
32 | 33 | Let's look at the key columns in this template: 34 | 35 | *(Based on the image from `MANUAL.md`)* 36 | ![Experiment Result Table Header.](https://github.com/sebhaan/DoEgen/blob/main/figures/Result_header.png){width=600} 37 | 38 | | Column Header | Description | Example Value | Why it's Important | 39 | | :------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------- | :----------------------------------------------------- | 40 | | `Nexp` | **Experiment Run Number.** This *must* match the `Nexp` number from your `Designtable_....csv` file. | `1`, `2`, ... | **Crucial!** This is how results are linked to factors. | 41 | | `PID` | **Point ID (Optional).** Use this if you measure results at multiple locations or times *within* the same experiment run (e.g., different sensors, different time points). | `SensorA`, `1` | Allows for more detailed, repeated measures analysis. | 42 | | `Y Label` | **Result Name (Optional).** Use this if you measure multiple *different types* of results for each run (e.g., 'Taste Score' and 'Baking Time' for the cake). | `Taste`, `MPG` | Allows analysis of multiple outcome variables. | 43 | | `Y Exp` | **The Actual Measured Result.** This is the outcome value you observed for the specific run (`Nexp`), point (`PID`), and result type (`Y Label`). | `8.5`, `25.3` | **The core data!** This is what you measured. | 44 | | `Y Truth` | **True/Target Value (Optional).** If you know the 'correct' or expected value (e.g., in simulations or calibration), enter it here. | `9.0`, `26.0` | Used for calculating accuracy (like RMSE) in analysis. | 45 | | `Std Y Exp` | Standard Deviation of `Y Exp` (Optional). | `0.2` | For advanced analysis considering measurement noise. | 46 | | `Std Y Truth` | Standard Deviation of `Y Truth` (Optional). | `0.1` | For advanced analysis. 
| 47 | | `Weight PID` | Weight for this specific point (Optional). | `1.0` | For advanced weighted analysis. | 48 | 49 | **The most important columns for basic use are `Nexp` and `Y Exp`.** 50 | 51 | ## Filling Out the Results Template 52 | 53 | Let's say you used the `Designtable_optimal_Nrun72.csv` from Chapter 4 for your experiment. It had 72 runs (`Nexp` from 1 to 72). You measured one result, let's call it 'Yield'. You only measured it once per run (so `PID` and `Y Label` can be simple, like `1`). 54 | 55 | Here’s how you might start filling out `Experiment_results_template.xlsx`: 56 | 57 | | Nexp | PID | Y Label | Y Exp | Y Truth | Std Y Exp | Std Y Truth | Weight PID | 58 | | :--- | :-: | :------ | :---- | :------ | :-------- | :---------- | :--------- | 59 | | 1 | 1 | Yield | 85.2 | | | | | 60 | | 2 | 1 | Yield | 91.5 | | | | | 61 | | 3 | 1 | Yield | 88.0 | | | | | 62 | | ... | ... | ... | ... | ... | ... | ... | ... | 63 | | 72 | 1 | Yield | 93.1 | | | | | 64 | 65 | *You would fill in the actual `Y Exp` value you measured for each of the 72 runs.* If you had multiple PIDs or Y Labels, you would have more rows. For example, if run `Nexp=1` had `PID=SensorA` and `PID=SensorB`, you'd have two rows for `Nexp=1`. 66 | 67 | **Key Point:** The `Nexp` column in your results file is the bridge connecting your measured `Y Exp` back to the specific factor settings used in that run, which are listed in the corresponding `Nexp` row of your `Designtable_....csv` file. 68 | 69 | ## How `DoEgen` Merges the Data (Internal View) 70 | 71 | You don't usually run a separate command just for merging. The merging happens *inside* the next step: the result analysis module ([Chapter 6: Result Analysis & Statistics 72 | ](06_result_analysis___statistics_.md)). When you run the analysis script (`doegen.doeval`), one of the first things it does is read both your design table and your results file and combine them. 73 | 74 | **Step-by-step merging process:** 75 | 76 | 1. 
**Read Design:** The analysis module reads the selected design table (e.g., `Designtable_optimal_Nrun72.csv`) which contains the factor settings for each `Nexp`. 77 | 2. **Read Results:** It reads your filled-in results file (e.g., `Experiment_results_Nrun72.xlsx`). 78 | 3. **Match 'Nexp':** It uses the `Nexp` column as the key to link the two tables. For each row in the results file, it finds the row in the design table with the *same* `Nexp`. 79 | 4. **Combine:** It creates a new, combined table that includes both the factor settings (from the design table) and the measured results (from the results file) for each experiment run. 80 | 81 | **Sequence Diagram:** 82 | 83 | ```mermaid 84 | sequenceDiagram 85 | participant U as User 86 | participant DEA as DoEgen Analysis (doeval.py) 87 | participant MER as merge_expresults() Function 88 | participant PD as Pandas Library 89 | participant CDT as Combined Data Table 90 | 91 | U->>DEA: Runs analysis with paths to design & result files 92 | DEA->>MER: Calls merge_expresults(result_file, design_file) 93 | MER->>PD: Asks Pandas to read Design CSV file 94 | PD-->>MER: Returns Design DataFrame (Table) 95 | MER->>PD: Asks Pandas to read Results Excel file 96 | PD-->>MER: Returns Results DataFrame (Table) 97 | MER->>PD: Asks Pandas to merge the two tables ON 'Nexp' 98 | PD-->>MER: Returns the combined DataFrame 99 | MER-->>DEA: Returns the merged data 100 | DEA->>CDT: Stores the combined data for analysis 101 | Note right of CDT: Ready for Chapter 6! 102 | ``` 103 | 104 | This diagram shows that the `doeval.py` script uses a helper function (`merge_expresults`), which in turn uses the powerful `pandas` library to read the files and perform the merge based on the `Nexp` column. 105 | 106 | ## Diving Deeper into the Code (Simplified View) 107 | 108 | The function responsible for this merging is typically `merge_expresults` inside the `doegen/doeval.py` script. 
Let's look at a simplified version: 109 | 110 | ```python 111 | # Simplified view from doegen/doeval.py - merge_expresults function 112 | import pandas as pd 113 | 114 | def merge_expresults(fname_result, fname_design, y_label=None): 115 | """ 116 | Reads experiment results and merges with the design parameter file. 117 | 118 | Args: 119 | fname_result (str): Path to the experimental results file (Excel). 120 | fname_design (str): Path to the experimental design file (CSV or Excel). 121 | y_label (str, optional): Filter results for a specific Y Label. Defaults to None. 122 | 123 | Returns: 124 | pandas.DataFrame: A combined table with factors and results. 125 | """ 126 | try: 127 | # Read the results file (Excel) 128 | print(f"Reading results file: {fname_result}") 129 | dfres = pd.read_excel(fname_result) 130 | 131 | # Read the design file (Can be CSV or Excel) 132 | print(f"Reading design file: {fname_design}") 133 | if fname_design.endswith('.csv'): 134 | dfdes = pd.read_csv(fname_design) 135 | else: 136 | dfdes = pd.read_excel(fname_design) 137 | 138 | # Optional: Filter results for a specific 'Y Label' if provided 139 | if y_label is not None: 140 | print(f"Filtering results for Y Label: {y_label}") 141 | dfres = dfres[dfres["Y Label"] == y_label] 142 | 143 | # --- The Core Merging Step --- 144 | # Use pandas merge function. It looks for common columns ('Nexp' here). 145 | # 'how="left"' means keep all rows from the results (left) table 146 | # and add matching data from the design (right) table. 147 | print(f"Merging results and design based on 'Nexp' column...") 148 | dfcomb = dfres.merge(dfdes, on="Nexp", how="left") 149 | 150 | print(f"Successfully merged data. 
Combined table has {dfcomb.shape[0]} rows and {dfcomb.shape[1]} columns.") 151 | return dfcomb 152 | 153 | except FileNotFoundError as e: 154 | print(f"Error: File not found - {e}") 155 | return None 156 | except Exception as e: 157 | print(f"Error during merging: {e}") 158 | return None 159 | 160 | # How it might be called inside doeval.py (simplified): 161 | # design_file = "output/Designtable_optimal_Nrun72.csv" 162 | # results_file = "data/Experiment_results_Nrun72.xlsx" 163 | # combined_data = merge_expresults(results_file, design_file) 164 | # if combined_data is not None: 165 | # # Proceed with analysis using combined_data... (Chapter 6) 166 | # pass 167 | ``` 168 | 169 | This code snippet shows the key steps: 170 | 1. It uses `pandas.read_excel` and `pandas.read_csv` to load your data into tables (called DataFrames). 171 | 2. It optionally filters the results based on the `Y Label`. 172 | 3. The magic happens with `dfres.merge(dfdes, on="Nexp", how="left")`. This tells pandas: "Take the results table (`dfres`), find the matching `Nexp` row in the design table (`dfdes`), and combine the columns into a single new table." 173 | 4. It returns this combined table, ready for the statistical analysis in the next chapter. 174 | 175 | ## Conclusion 176 | 177 | In this chapter, we focused on the crucial step of preparing your experimental results for `DoEgen`. We learned about the **Experiment Results Excel template** and the importance of structuring your data, especially using the `Nexp` column to link results back to the specific experimental conditions (factors and levels) from your chosen design table. 178 | 179 | We saw that while you manually fill in the results template, the actual **merging** of results with the design plan happens automatically as the first step within the analysis module (`doeval.py`), using the `pandas` library. 
180 | 181 | Now that we have a single, combined table containing both the experimental plan *and* the measured outcomes, we are perfectly set up to finally analyze the data and understand the impact of our factors. 182 | 183 | Let's proceed to [Chapter 6: Result Analysis & Statistics 184 | ](06_result_analysis___statistics_.md) to learn how `DoEgen` helps us make sense of these combined results! 185 | 186 | --- 187 | 188 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/06_result_analysis___statistics_.md: -------------------------------------------------------------------------------- 1 | # Chapter 6: Result Analysis & Statistics 2 | 3 | Welcome back! In [Chapter 5: Experiment Result Input & Merging 4 | ](05_experiment_result_input___merging_.md), we successfully combined our experimental plan (the factor settings) with the actual results we measured (`Y Exp` values). We now have a single, rich dataset ready for interrogation! 5 | 6 | But just having the data isn't enough. We need to make sense of it. Which factors actually *mattered*? Which settings led to the best outcomes? This chapter introduces **Result Analysis & Statistics**, the part of `DoEgen` that helps you answer these critical questions. 7 | 8 | ## What's the Big Idea? Finding the Story in Your Data 9 | 10 | Imagine you completed all the test drives from our car fuel economy example (Chapter 5). You diligently recorded the MPG for each car configuration you tested. Now, you have a spreadsheet full of numbers. What next? 11 | 12 | You'd want to analyze this data to find the story: 13 | * **Performance Check:** How good were the results overall? If you knew the "official" MPG ratings (`Y Truth`), how close were your measurements? (This is like calculating accuracy). 
14 | * **Key Influencers:** Did changing the `Engine Type` have a huge impact on MPG? What about `Tire Pressure`? Or was `Paint Color` irrelevant? (This is finding factor importance). 15 | * **Top Performers:** Which specific combination of `Engine Type`, `Tire Pressure`, etc., gave the absolute best fuel economy in your tests? (This is identifying the best settings). 16 | 17 | **Result Analysis & Statistics** in `DoEgen` does precisely this kind of analysis automatically. It processes your combined data to extract meaningful insights, helping you understand *what* happened in your experiment and *why*. 18 | 19 | ## Key Analysis Tasks `DoEgen` Performs 20 | 21 | `DoEgen`'s analysis module (`doeval.py`) focuses on several key tasks: 22 | 23 | 1. **Calculating RMSE (if `Y Truth` is available):** 24 | * **What it is:** Root Mean Square Error (RMSE) measures the average difference between your experimental results (`Y Exp`) and the known true values (`Y Truth`). 25 | * **Analogy:** If the car manufacturer stated a car gets 30 MPG (`Y Truth`), and your test drive measured 28 MPG (`Y Exp`), the error is 2 MPG. RMSE calculates an overall "average error" across all your tests. 26 | * **Why it matters:** A lower RMSE indicates your experimental results were closer to the true values, suggesting higher accuracy or a better model fit (in simulations). 27 | 28 | 2. **Determining Factor "Importance":** 29 | * **What it is:** This identifies how much influence each factor has on the outcome (`Y Exp`). It calculates the *range* of the average outcome when changing a factor's levels. 30 | * **Analogy:** If the average MPG for 'Electric' engines was 50 and for 'Petrol' engines was 25, the range (importance) for `Engine Type` is 50 - 25 = 25 MPG (a big impact!). If the average MPG for 'Red' paint was 30.1 and 'Blue' paint was 30.0, the range for `Paint Color` is only 0.1 MPG (very low importance). 
31 | * **Why it matters:** Helps you focus on the factors that actually drive the results and ignore the ones that don't make much difference. 32 | 33 | 3. **Identifying Top Performers:** 34 | * **What it is:** If you provided `Y Truth`, `DoEgen` ranks the experiments based on the lowest RMSE (most accurate runs). It lists the factor settings for these top-performing runs. 35 | * **Analogy:** Listing the top 5 car configurations from your test drives that had the smallest difference between your measured MPG and the official MPG rating. 36 | * **Why it matters:** Helps you pinpoint the specific settings that achieved the best (or most accurate) results in your experiment. 37 | 38 | 4. **Correlation Analysis:** 39 | * **What it is:** Examines the relationship between each factor and the outcome (Y). Does increasing Factor X tend to increase Y? Decrease Y? Or have no clear relationship? 40 | * **Analogy:** Does higher `Tire Pressure` generally lead to higher `MPG`? 41 | * **Why it matters:** Gives insights into the direction and strength of the relationship between inputs and outputs. 42 | 43 | ## How to Run the Analysis 44 | 45 | Running the result analysis is straightforward. You'll need: 46 | 47 | 1. Your **Design Table** file (e.g., `Designtable_optimal_Nrun72.csv` from [Chapter 4: Design Selection 48 | ](04_design_selection_.md)). 49 | 2. Your filled-in **Experiment Results** file (e.g., `Experiment_results_Nrun72.xlsx` from [Chapter 5: Experiment Result Input & Merging 50 | ](05_experiment_result_input___merging_.md)). 51 | 3. A **Settings file** for the analysis (usually `settings_expresults.yaml`). This file tells `DoEgen` where to find your input files and where to save the analysis outputs. You can create default templates using `python -m doegen.init_config` if needed. 
52 | 53 | Once these are ready, you run the `doeval` module from your terminal: 54 | 55 | ```bash 56 | # Make sure your design and result files are ready 57 | # Make sure your settings file (e.g., settings_expresults.yaml) points to them 58 | 59 | # Run the result evaluation module 60 | python -m doegen.doeval settings_expresults.yaml 61 | ``` 62 | 63 | **What does this command do?** 64 | * It tells Python to run the `doeval` module within the `doegen` package. 65 | * It passes the `settings_expresults.yaml` file, which contains the necessary file paths and analysis options. 66 | 67 | **What happens next?** 68 | `DoEgen` will: 69 | 1. Read the settings file. 70 | 2. Read your design table and results file. 71 | 3. **Merge** the two tables based on the `Nexp` column (as discussed in Chapter 5). 72 | 4. Perform the statistical analyses (RMSE, Importance, Correlations, Top Performers) on the merged data. 73 | 5. Save the results as tables (`.csv` files) and plots (`.png` files) in the output directory specified in your settings file. 74 | 75 | ## Understanding the Analysis Outputs 76 | 77 | After running `doeval`, look inside your specified output folder (e.g., `test/expresults/` in the example). You'll find several helpful files for each 'Y Label' you analyzed: 78 | 79 | * **Factor Importance:** 80 | * `Experiment_[Y_Label]_Factorimportance.csv`: A table listing each factor and its calculated importance (Yrange), along with the min, max, mean, and standard deviation of the average Y value across its levels. 81 | * `Ybarplot_[Y_Label].png`: A bar chart visually showing the importance (range) of each factor. Factors with longer bars have a bigger impact. 
82 | ![Factor Importance ranked from maximum to lowest change (range) in Y](https://github.com/sebhaan/DoEgen/blob/main/figures/Ybarplot_1.png){width=600} 83 | 84 | * **RMSE and Top Performers (if `Y Truth` was provided):** 85 | * `Experiment_[Y_Label]_RMSE.csv`: The combined data table with an added 'RMSE' column showing the calculated error for each experiment run. 86 | * `Experiment_[Y_Label]_RMSE_TopN_sorted.csv`: A table showing the factor settings for the top N experiments that had the *lowest* RMSE (i.e., the most accurate runs). 87 | ![Picture of Table `Experiment_1_RMSE_Top10_sorted.csv` which shows the factor values of the top 10 experiments based on their RMSE values.](https://github.com/sebhaan/DoEgen/blob/main/figures/Top10.png){width=600} 88 | * `BestFactor_Avg[Y_Label].png`: A bar chart showing the average factor settings for the top N experiments, weighted by their RMSE. This gives an idea of the optimal settings based on accuracy. 89 | ![Factor values of the top 10 experiments based on their RMSE values. The bar heights indicate the top factor’s average value and the dark lines their standard deviation. Note that the average and their standard deviation are computed with the weights RMSE^(-2).](https://github.com/sebhaan/DoEgen/blob/main/figures/BestFactor_Avg1.png){width=600} 90 | 91 | * **Correlation Plots:** 92 | * `Expresult_correlation_X-Y_[Y_Label].png`: Shows scatter plots of the outcome (Y Exp Mean) versus each *numeric* factor, with a regression line showing the trend. Helps visualize linear relationships. 93 | ![Overview plot of X-Y Correlation for each factor as a function of their level values. On top the linear regression coefficient `r` is shown along the linear regression fit and its uncertainty (line and shadow).](https://github.com/sebhaan/DoEgen/blob/main/figures/Expresult_correlation_X_1.png){width=600} 94 | * `Y-pairwise-correlation_[Y_Label].png`: A "corner plot" showing heatmaps for every pair of factors.
The color indicates the average outcome (Y Exp Mean) for that combination of factor levels. Useful for seeing interactions and how combinations affect the result. 95 | ![Cornerplot of pairwise factor relation with Y. The color(bar) indicates the value of Y.](https://github.com/sebhaan/DoEgen/blob/main/figures/Expresult_pairwise-correlation_1.png){width=600} 96 | * (Similar plots for RMSE vs. factors are also generated if `Y Truth` is available). 97 | 98 | These outputs provide a comprehensive overview of your experiment's results. 99 | 100 | ## What's Happening Under the Hood? 101 | 102 | Let's look at the main steps `doeval.py` takes when you run the analysis command: 103 | 104 | 1. **Read Settings:** Loads the paths and options from your `settings_expresults.yaml` file. 105 | 2. **Merge Data:** Calls the `merge_expresults` function (from `doegen/doeval.py`, discussed in Chapter 5) to read the design (`.csv`) and results (`.xlsx`) files and combine them into a single pandas DataFrame based on the `Nexp` column. 106 | 3. **Calculate Statistics (`calc_expresults_stats`):** This is the core analysis function (in `doegen/doeval.py`). It iterates through each `Y Label` (if you have multiple outcomes): 107 | * Calculates average `Y Exp` and `Y Truth` (if available) for each `Nexp` (handling multiple PIDs if present). 108 | * **Factor Importance:** For each factor, it groups the data by the factor's levels, calculates the average `Y Exp` for each level, and finds the range (max avg - min avg). 109 | * **RMSE:** If `Y Truth` is present, calculates the RMSE for each `Nexp`. 110 | * **Top Performers:** Sorts the results by RMSE and identifies the top N runs. Calculates weighted averages of factor settings for these top runs. 111 | * Saves the calculated statistics to `.csv` files. 112 | 4. 
**Generate Plots:** Calls various plotting functions (like `plot_3dmap`, `plot_regression`, also in `doegen/doeval.py`) using the calculated statistics and the merged data to create the `.png` visualizations. 113 | 114 | **Sequence Diagram:** 115 | 116 | ```mermaid 117 | sequenceDiagram 118 | participant U as User 119 | participant DEV as doeval.py (Main Script) 120 | participant MER as merge_expresults() 121 | participant CALC as calc_expresults_stats() 122 | participant PLOT as Plotting Functions 123 | participant Files as Output Files (.csv, .png) 124 | 125 | U->>DEV: Runs `python -m doegen.doeval settings.yaml` 126 | DEV->>DEV: Reads settings_expresults.yaml 127 | DEV->>MER: Calls merge_expresults(results_file, design_file) 128 | MER-->>DEV: Returns combined DataFrame (Merged Data) 129 | DEV->>CALC: Calls calc_expresults_stats(Merged Data) 130 | Note right of CALC: Calculates Importance, RMSE, Top N... 131 | CALC-->>DEV: Returns statistics / Modifies DataFrame 132 | DEV->>Files: Saves statistics tables (.csv) 133 | DEV->>PLOT: Calls plotting functions (plot_3dmap, plot_regression...) 134 | PLOT->>Files: Saves plots (.png) 135 | DEV-->>U: Prints "FINISHED" message 136 | ``` 137 | 138 | ## Diving Deeper into the Code (Simplified View) 139 | 140 | The main logic resides in `doegen/doeval.py`. 141 | 142 | 1. **Main Execution (`main` function):** Orchestrates the process. 143 | 144 | ```python 145 | # Simplified view from doegen/doeval.py - main function 146 | 147 | import pandas as pd 148 | import yaml 149 | from pathlib import Path # For handling file paths 150 | 151 | # Import helper functions from the same file 152 | from .doeval import merge_expresults, calc_expresults_stats, plot_3dmap, plot_regression # ... 
other plotting functions 153 | 154 | def main(inpath, fname_results, fname_design, outpath=None): 155 | # --- Setup Paths --- 156 | inpath = Path(inpath) 157 | if outpath is None: 158 | outpath = inpath 159 | else: 160 | outpath = Path(outpath) 161 | outpath.mkdir(parents=True, exist_ok=True) # Create output folder if needed 162 | 163 | # --- 1. Read Design and Results --- 164 | # (Uses pandas internally as shown in Chapter 5) 165 | print("Reading and merging design and result files...") 166 | dfcomb = merge_expresults(inpath / fname_results, inpath / fname_design) 167 | if dfcomb is None: 168 | print("Error during file reading/merging. Exiting.") 169 | return # Stop if merging failed 170 | 171 | # Get design table separately for stats calculation logic 172 | if str(fname_design).endswith('.csv'): 173 | dfdes = pd.read_csv(inpath / fname_design) 174 | else: 175 | dfdes = pd.read_excel(inpath / fname_design) 176 | # Filter out constant factors if any 177 | dfdes = dfdes[dfdes.columns[dfdes.nunique() > 1]].copy() 178 | params = list(dfdes)[1:] # Get factor names 179 | 180 | # Get the unique result types (Y Labels) 181 | try: 182 | ylabels = dfcomb["Y Label"].unique() 183 | except KeyError: # Handle case where 'Y Label' column might be missing 184 | print("Warning: 'Y Label' column not found. Assuming a single result type 'Y1'.") 185 | dfcomb["Y Label"] = 'Y1' 186 | ylabels = dfcomb["Y Label"].unique() 187 | 188 | # --- 2. Calculate Statistics --- 189 | print("Calculating statistics (Importance, RMSE, Top Performers)...") 190 | # Pass the original results DataFrame (dfcomb) and design DataFrame (dfdes) 191 | calc_expresults_stats(ylabels, dfdes, dfcomb, outpath) 192 | # This function saves its own CSV outputs internally 193 | 194 | # --- 3. 
Generate Plots --- 195 | print("Generating plots...") 196 | for ylabel in ylabels: 197 | print(f" Plotting for Y Label: {ylabel}") 198 | # Reload the RMSE results saved by calc_expresults_stats 199 | try: 200 | df_results_for_ylabel = pd.read_csv(outpath / f"Experiment_{ylabel}_RMSE.csv") 201 | except FileNotFoundError: 202 | print(f"Warning: RMSE file for {ylabel} not found, skipping some plots.") 203 | continue # Skip to next ylabel if file doesn't exist 204 | 205 | # Call plotting functions 206 | plot_3dmap(df_results_for_ylabel, params, "Y Exp Mean", 207 | outpath / f"Y-pairwise-correlation_{ylabel}.png") 208 | plot_regression(df_results_for_ylabel, params, 'Y Exp Mean', 209 | outpath / f"Expresult_correlation_X-Y_{ylabel}.png") 210 | 211 | # Plot RMSE-related plots only if RMSE column exists 212 | if 'RMSE' in df_results_for_ylabel.columns: 213 | plot_3dmap(df_results_for_ylabel, params, "RMSE", 214 | outpath / f"RMSE-pairwise-correlation_{ylabel}.png") 215 | # plot_factordis(df_results_for_ylabel, params, 'RMSE', # Example of another plot 216 | # outpath / f"Expresult_distribution_X-RMSE_{ylabel}.png") 217 | 218 | print("FINISHED Result Analysis") 219 | 220 | # The script uses argparse to read the settings file path from the command line 221 | # and then calls main(**cfg) where cfg is the dictionary loaded from YAML 222 | ``` 223 | 224 | 2. **Core Statistics (`calc_expresults_stats`):** This function does the heavy lifting. 
225 | 226 | ```python 227 | # Simplified logic inside doegen/doeval.py - calc_expresults_stats function 228 | 229 | import numpy as np 230 | import pandas as pd 231 | import matplotlib.pyplot as plt # Used for the importance bar plot 232 | 233 | def calc_expresults_stats(ylabels, dfdes, dfres, outpath): 234 | params = list(dfdes)[1:] # Factor names 235 | npar = len(params) 236 | nexp = dfdes.shape[0] 237 | 238 | for ylabel in ylabels: 239 | print(f"-- Analyzing Y Label: {ylabel} --") 240 | # --- Prepare Data for this Y Label --- 241 | ydf = dfres[dfres["Y Label"] == ylabel].copy() # Filter results for this Y 242 | # Calculate mean Y values per experiment run (Nexp) 243 | ymean = ydf.groupby("Nexp")["Y Exp"].mean() 244 | ystd = ydf.groupby("Nexp")["Y Exp"].std() 245 | # Add these means to a copy of the design DataFrame 246 | dfdes_y = dfdes.copy() 247 | dfdes_y["Y Exp Mean"] = ymean 248 | dfdes_y["Y Exp Std"] = ystd 249 | 250 | # --- Calculate Factor Importance --- 251 | factor_importance = [] 252 | for i, param in enumerate(params): 253 | levels = dfdes_y[param].unique() 254 | avg_y_per_level = [] 255 | for level in levels: 256 | # Get average Y for runs where factor 'param' was at 'level' 257 | avg_y = dfdes_y.loc[dfdes_y[param] == level, "Y Exp Mean"].mean() 258 | avg_y_per_level.append(avg_y) 259 | # Importance = Range of average Y values across levels 260 | yrange = np.nanmax(avg_y_per_level) - np.nanmin(avg_y_per_level) 261 | factor_importance.append({ 262 | 'Factor': param, 'Yrange': yrange, 263 | 'Ymin': np.nanmin(avg_y_per_level), 'Ymax': np.nanmax(avg_y_per_level), 264 | 'Ymean': np.nanmean(avg_y_per_level), 'Ystd': np.nanstd(avg_y_per_level) 265 | }) 266 | # Save importance results 267 | df_importance = pd.DataFrame(factor_importance).set_index('Factor') 268 | df_importance.to_csv(outpath / f"Experiment_{ylabel}_Factorimportance.csv") 269 | # Plot importance bar chart (simplified call) 270 | df_importance.sort_values('Yrange')['Yrange'].plot(kind='barh', 
title=f'Importance (Range) {ylabel}') 271 | plt.tight_layout() 272 | plt.savefig(outpath / f"Ybarplot_{ylabel}.png") 273 | plt.close() 274 | 275 | # --- Calculate RMSE (if Y Truth exists) --- 276 | if "Y Truth" in ydf.columns and ydf["Y Truth"].notnull().any(): 277 | print(" Calculating RMSE...") 278 | ytruemean = ydf.groupby("Nexp")["Y Truth"].mean() 279 | dfdes_y["Y Truth Mean"] = ytruemean 280 | # Calculate squared error for each Nexp 281 | sq_error = (dfdes_y["Y Exp Mean"] - dfdes_y["Y Truth Mean"])**2 282 | # Need to handle potential multiple PIDs per Nexp correctly for RMSE 283 | # (Actual code might need more careful averaging of squared errors before sqrt) 284 | # Simplified: Assume one value per Nexp for RMSE calculation here 285 | rmse = np.sqrt(sq_error) # Simplified: Should average before sqrt if multiple PIDs 286 | dfdes_y["RMSE"] = rmse 287 | # Save combined table with RMSE 288 | dfdes_y.to_csv(outpath / f"Experiment_{ylabel}_RMSE.csv", index=False) 289 | 290 | # --- Identify Top Performers --- 291 | print(" Identifying Top Performers by RMSE...") 292 | nsel = min(10, max(3, nexp // 5)) # Select top ~20%, between 3 and 10 293 | dfsort = dfdes_y.sort_values("RMSE").head(nsel) 294 | dfsort.to_csv(outpath / f"Experiment_{ylabel}_RMSE_Top{nsel}_sorted.csv", index=False) 295 | 296 | # (Actual code also calculates weighted average parameters for top performers 297 | # and plots them using helper functions like weighted_avg_and_std and plot_table) 298 | else: 299 | print(" 'Y Truth' not found or empty, skipping RMSE calculations.") 300 | # Save the table without RMSE if Y Truth was missing 301 | dfdes_y.to_csv(outpath / f"Experiment_{ylabel}_results_summary.csv", index=False) 302 | ``` 303 | 304 | These snippets illustrate how `DoEgen` loads data, iterates through factors and outcomes, performs calculations using `pandas` and `numpy`, and generates outputs. 305 | 306 | ## Conclusion 307 | 308 | In this chapter, we dove into **Result Analysis & Statistics**. 
We learned how `DoEgen` takes the merged experiment plan and results data (from Chapter 5) and processes it to uncover valuable insights. 309 | 310 | We saw how to run the `doeval.py` script and what key analyses it performs: calculating accuracy (RMSE if `Y Truth` is available), determining which factors had the biggest impact (Factor Importance), identifying the best-performing settings, and examining correlations. We also explored the various `.csv` tables and `.png` plots generated, which provide a comprehensive summary of your experiment's findings. 311 | 312 | With this analysis complete, you have a much clearer picture of how your factors influence your outcomes. The plots generated provide powerful visual summaries. 313 | 314 | Now, let's take a closer look at these visualizations in the next chapter. We'll explore how to interpret the different plots `DoEgen` creates in more detail in [Chapter 7: Result Visualization 315 | ](07_result_visualization_.md). 316 | 317 | --- 318 | 319 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/07_result_visualization_.md: -------------------------------------------------------------------------------- 1 | # Chapter 7: Result Visualization 2 | 3 | In the previous chapter, [Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md), we learned how `DoEgen` analyzes the combined experiment plan and results data. It calculated important statistics like factor importance and RMSE, and identified top-performing experiments. This gave us valuable tables full of numbers. 4 | 5 | But sometimes, just looking at tables of numbers can be overwhelming. It's often much easier to understand patterns and trends by looking at pictures! This is where **Result Visualization** comes in. 6 | 7 | ## Why Pictures? 
Making Sense of Results Visually 8 | 9 | Imagine you just finished your car test drives (from our ongoing example). You have a spreadsheet showing the MPG for every combination of engine, tires, and driving style you tested. You also have the analysis results from Chapter 6, telling you which factors were most "important". 10 | 11 | Now, how do you easily *see* these findings? 12 | * How much *more* important was `Engine Type` compared to `Tire Pressure`? A bar chart makes this comparison instant. 13 | * Does higher `Tire Pressure` *always* lead to better MPG, or does it level off? A scatter plot with a trend line reveals the relationship. 14 | * Does the best `Engine Type` change if you also use `Sporty Tires`? A heatmap showing combinations helps spot these interactions. 15 | 16 | **Result Visualization** in `DoEgen` automatically creates these kinds of **charts and graphs** from your analysis results. It turns the numbers and tables from Chapter 6 into visual summaries, making it much faster and easier to grasp the key takeaways from your experiment. 17 | 18 | ## The Main Visual Tools `DoEgen` Provides 19 | 20 | `DoEgen` generates several types of plots automatically when you run the analysis step ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)). Let's look at the most common ones: 21 | 22 | 1. **Factor Importance Bar Chart (`Ybarplot_[Y_Label].png`)** 23 | * **What it shows:** Compares how much influence each factor had on the outcome (`Y Exp`). Longer bars mean the factor caused a bigger change in the results when its levels were varied. 24 | * **Why it's useful:** Quickly identifies the most impactful factors ("big hitters") and the least impactful ones. Helps you focus on what really matters. 25 | 26 | 2. **Factor vs. Outcome Correlation Plot (`Expresult_correlation_X-Y_[Y_Label].png`)** 27 | * **What it shows:** For each *numeric* factor, it plots the factor's value against the average outcome (`Y Exp Mean`). 
It also draws a line showing the general trend (linear regression). 28 | * **Why it's useful:** Helps understand the *direction* and *linearity* of the relationship. Does increasing the factor generally increase or decrease the outcome? Is the relationship roughly a straight line? 29 | 30 | 3. **Pairwise Factor Heatmap (`Y-pairwise-correlation_[Y_Label].png`)** 31 | * **What it shows:** A grid ("corner plot") where each square shows the interaction between *two* factors. The color in the square represents the average outcome (`Y Exp Mean`) when those two factors were set to specific levels. 32 | * **Why it's useful:** Excellent for spotting *interactions*. Does the effect of Factor A change depending on the level of Factor B? You can see which combinations lead to high or low outcomes. It works for both numeric and categorical factors. 33 | 34 | 4. **Top Performers Average Settings (`BestFactor_Avg[Y_Label].png`)** *(Only if `Y Truth` was provided)* 35 | * **What it shows:** If you calculated RMSE (accuracy), this plot shows the average settings of the factors for the most accurate (lowest RMSE) experiments. 36 | * **Why it's useful:** Gives a visual clue about the combination of settings that led to the most accurate results in your specific experiment. 37 | 38 | ## How are the Plots Generated? 39 | 40 | You don't need to run a separate command! These plots are created automatically as part of the [Result Analysis & Statistics](06_result_analysis___statistics_.md) process. When you run: 41 | 42 | ```bash 43 | python -m doegen.doeval settings_expresults.yaml 44 | ``` 45 | 46 | After calculating the statistics (like importance and correlations), `DoEgen` uses plotting libraries (`matplotlib` and `seaborn`) to generate these `.png` image files and saves them in your output directory (specified in `settings_expresults.yaml`), alongside the `.csv` tables from Chapter 6. 
47 | 48 | ## Reading the Visual Story: Interpreting the Plots 49 | 50 | Let's learn how to read the main plots using the examples from the `DoEgen` documentation. 51 | 52 | **1. Factor Importance Bar Chart** 53 | 54 | ![Factor Importance ranked from maximum to lowest change (range) in Y](https://github.com/sebhaan/DoEgen/blob/main/figures/Ybarplot_1.png){width=600} 55 | 56 | * **How to read it:** 57 | * The **Y-axis** lists your factors (parameters). 58 | * The **X-axis** shows the "Importance" or "Range" of the outcome (Y). This is the difference between the average Y value at the factor's highest-impact level and its lowest-impact level. 59 | * **Longer bars** mean the factor had a bigger impact on the outcome during your experiment. Shorter bars mean it had less impact. 60 | * **Example interpretation:** In this plot, `Parameter6` has the longest bar, meaning changing its levels caused the largest variation in the measured outcome (Y). `Parameter1` and `Parameter8` had relatively small impacts. This instantly tells you where to focus your attention if you want to control the outcome Y. 61 | 62 | **2. Factor vs. Outcome Correlation Plot** 63 | 64 | ![Overview plot of X-Y Correlation for each factor as function of their level values. On top the linear regression coefficient `r` is shown along the linear regression fit and its uncertainty (line and shadow).](https://github.com/sebhaan/DoEgen/blob/main/figures/Expresult_correlation_X_1.png){width=600} 65 | 66 | * **How to read it:** 67 | * This shows a grid of smaller plots, one for each *numeric* factor. 68 | * In each small plot: 69 | * The **X-axis** is the value of the factor. 70 | * The **Y-axis** is the average outcome (`Y Exp Mean`). 71 | * The **dots** represent the average outcome measured at different levels of that factor. 72 | * The **blue line** shows the best straight-line fit through the dots (linear regression). 73 | * The **shaded blue area** shows the uncertainty in that fit. 
74 | * The **`r` value** (top left) is the correlation coefficient. `r` close to 1 means strong positive linear correlation (line goes up), `r` close to -1 means strong negative linear correlation (line goes down), `r` close to 0 means weak or no linear correlation. 75 | * **Example interpretation:** 76 | * For `Parameter1`, the line is almost flat and `r` is close to 0, suggesting little linear relationship with Y. 77 | * For `Parameter3`, the line goes downwards and `r` is negative (around -0.5), suggesting that increasing `Parameter3` tends to decrease Y. 78 | * For `Parameter7`, the line goes upwards steeply and `r` is strongly positive (around 0.9), indicating a strong positive linear relationship between `Parameter7` and Y. 79 | 80 | **3. Pairwise Factor Heatmap (Corner Plot)** 81 | 82 | ![Cornerplot of pairwise factor relation with Y. The color(bar) indicates the value of Y.](https://github.com/sebhaan/DoEgen/blob/main/figures/Expresult_pairwise-correlation_1.png){width=600} 83 | 84 | * **How to read it:** 85 | * This is a grid showing interactions between pairs of factors. 86 | * Look at a specific square, for example, the one in the second row, first column. The **X-axis** corresponds to `Parameter1`, and the **Y-axis** corresponds to `Parameter2`. 87 | * The **colors** inside the square represent the average outcome (`Y Exp Mean`) observed for different combinations of `Parameter1` and `Parameter2` levels. 88 | * The **colorbar** on the right tells you what outcome value each color corresponds to (e.g., blue might be low Y, red might be high Y). 89 | * **Example interpretation:** By looking at the colors in the grid for `Parameter1` vs `Parameter2`, you can see if certain combinations lead to particularly high (red) or low (blue) values of Y. If the color pattern changes drastically across the square, it suggests an *interaction* - the effect of `Parameter1` on Y depends on the level of `Parameter2`. 
If you see similar plots for RMSE (`RMSE-pairwise-correlation...`), you can see which combinations led to more or less accurate results. 90 | 91 | ## What's Happening Under the Hood? (Simplified) 92 | 93 | Generating these plots involves using the results of the statistical analysis from Chapter 6. 94 | 95 | **Simplified Flow:** 96 | 97 | ```mermaid 98 | sequenceDiagram 99 | participant DEV as doeval.py (Main Script) 100 | participant CALC as calc_expresults_stats() (from Ch 6) 101 | participant PLT as Plotting Libraries (Matplotlib, Seaborn) 102 | participant PlotFuncs as Plotting Functions (plot_3dmap, plot_regression, etc.) 103 | participant Files as Output .png Files 104 | 105 | DEV->>CALC: Calculates statistics (Importance, Means, RMSE...) 106 | CALC-->>DEV: Returns stats / Saves CSVs 107 | 108 | DEV->>PlotFuncs: Calls plot_importance_bar(importance_data) 109 | PlotFuncs->>PLT: Uses Matplotlib to draw bars 110 | PLT-->>Files: Saves Ybarplot_[...].png 111 | 112 | DEV->>PlotFuncs: Calls plot_regression(merged_data) 113 | PlotFuncs->>PLT: Uses Seaborn regplot for each factor 114 | PLT-->>Files: Saves Expresult_correlation_X-Y_[...].png 115 | 116 | DEV->>PlotFuncs: Calls plot_3dmap(merged_data, 'Y Exp Mean') 117 | PlotFuncs->>PLT: Uses Pandas pivot_table & Seaborn heatmap 118 | PLT-->>Files: Saves Y-pairwise-correlation_[...].png 119 | 120 | Note right of DEV: Similar calls for RMSE plots if applicable. 121 | ``` 122 | 123 | The `doeval.py` script first calculates the necessary data using `calc_expresults_stats`. Then, it calls specific plotting functions (like `plot_3dmap`, `plot_regression`) defined within `doeval.py`. These functions take the calculated data (often stored in pandas DataFrames) and use commands from the `matplotlib` and `seaborn` libraries to draw the actual plots and save them as `.png` files. 124 | 125 | **Simplified Code Snippets (`doegen/doeval.py`):** 126 | 127 | Let's look at tiny snippets to get the idea. 128 | 129 | 1. 
**Factor Importance Bar Plot (inside `calc_expresults_stats`)** 130 | 131 | ```python 132 | # Simplified view from doegen/doeval.py - inside calc_expresults_stats 133 | 134 | import matplotlib.pyplot as plt 135 | import pandas as pd 136 | import numpy as np 137 | 138 | def calc_expresults_stats(ylabels, dfdes, dfres, outpath): 139 | params = list(dfdes)[1:] # Factor names 140 | npar = len(params) 141 | 142 | for ylabel in ylabels: 143 | # ... (calculate factor importance 'width' as shown in Ch 6) ... 144 | # df_importance = pd.DataFrame(...) # Contains 'Yrange' for each factor 145 | 146 | # --- Plotting Part --- 147 | plt.ioff() # Turn off interactive display 148 | plt.figure(figsize=(8, 5)) 149 | # Sort factors by importance (range) 150 | df_sorted = df_importance.sort_values('Yrange') 151 | # Create horizontal bar plot 152 | plt.barh( 153 | df_sorted.index, # Factor names on Y-axis 154 | width=df_sorted['Yrange'], # Bar length based on importance 155 | # left=ymin_par[sort], # Optional: show min value start 156 | color="red", 157 | ) 158 | plt.title(f"Factor Importance (Range) for {ylabel}") 159 | plt.xlabel("Change in Y (Range)") 160 | plt.tight_layout() # Adjust spacing 161 | # Save the plot 162 | plt.savefig(outpath / f"Ybarplot_{ylabel}.png", dpi=300) 163 | plt.close() # Close the plot figure 164 | # ... (rest of the stats calculation) ... 165 | ``` 166 | * This code uses `matplotlib.pyplot` (imported as `plt`) to create a horizontal bar chart (`plt.barh`) using the calculated importance range (`Yrange`). It saves the figure using `plt.savefig`. 167 | 168 | 2. **Factor vs. 
Outcome Correlation Plot (`plot_regression`)** 169 | 170 | ```python 171 | # Simplified view from doegen/doeval.py - plot_regression function 172 | 173 | import matplotlib.pyplot as plt 174 | import seaborn as sns 175 | import pandas as pd 176 | import numpy as np 177 | 178 | def plot_regression(df, params, target_name, fname_out): 179 | """Creates Correlation plot.""" 180 | # Select only numeric columns from the factor list 181 | numeric_params = df[params].select_dtypes(include=np.number).columns 182 | nfac = len(numeric_params) 183 | # Determine grid layout for subplots 184 | nax1 = int(np.sqrt(nfac)) 185 | nax2 = int(np.ceil(nfac / nax1)) 186 | 187 | plt.ioff() 188 | fig = plt.figure(figsize=(nax1 * 4, nax2 * 3)) # Adjust figure size 189 | fig.suptitle(f"Factor vs. {target_name} Correlation", y=1.02) # Add title 190 | 191 | for i, param_name in enumerate(numeric_params): 192 | # Create a subplot in the grid 193 | ax = fig.add_subplot(nax2, nax1, i + 1) 194 | # Use seaborn's regplot to create scatter + regression line 195 | sns.regplot(x=param_name, y=target_name, data=df, ax=ax, 196 | scatter_kws={'s': 10}, # Smaller points 197 | line_kws={'lw': 1}) # Thinner line 198 | # Calculate correlation coefficient 199 | r = df[param_name].corr(df[target_name]) 200 | # Add correlation value text to the plot 201 | ax.annotate(f"r = {r:.2f}", xy=(0.05, 0.9), xycoords='axes fraction') 202 | ax.set_title(param_name) # Set title for subplot 203 | 204 | plt.tight_layout(rect=[0, 0, 1, 0.98]) # Adjust layout to prevent title overlap 205 | plt.savefig(fname_out, dpi=300) 206 | plt.close() 207 | ``` 208 | * This function uses the `seaborn` library (`sns.regplot`) which is built on top of `matplotlib`. `regplot` automatically creates the scatter plot and fits/draws the regression line for each numeric factor in a loop, placing each one in a subplot grid. 209 | 210 | 3. 
**Pairwise Heatmap (`plot_3dmap`)** 211 | 212 | ```python 213 | # Simplified view from doegen/doeval.py - plot_3dmap function 214 | 215 | import matplotlib.pyplot as plt 216 | import seaborn as sns 217 | import pandas as pd 218 | import numpy as np 219 | 220 | def plot_3dmap(df, params, target_name, fname_out): 221 | """Plots pairwise heatmap (corner plot).""" 222 | nfac = len(params) 223 | # Find overall min/max of the target for consistent color scaling 224 | vmin = df[target_name].min() 225 | vmax = df[target_name].max() 226 | 227 | plt.ioff() 228 | fig, axs = plt.subplots(nfac - 1, nfac - 1, figsize=(nfac * 1.5, nfac * 1.5)) # Smaller figsize 229 | fig.suptitle(f"Pairwise Factor Heatmap for {target_name}", y=1.02) 230 | 231 | for i in range(nfac - 1): # Index for columns (X-axis factor) 232 | for j in range(i + 1, nfac): # Index for rows (Y-axis factor) 233 | ax = axs[j - 1, i] # Select the correct subplot 234 | try: 235 | # Create a pivot table: average target value for each combination 236 | pivot_data = pd.pivot_table( 237 | df, values=target_name, index=[params[j]], columns=[params[i]], 238 | aggfunc=np.nanmean # Use mean as the aggregation 239 | ) 240 | # Draw the heatmap using seaborn 241 | sns.heatmap( 242 | pivot_data, cmap="viridis", # Use a different colormap 243 | annot=False, ax=ax, # No annotations for cleaner look 244 | vmin=vmin, vmax=vmax, # Consistent color scale 245 | square=True, cbar=False # Make squares, no individual color bars 246 | ) 247 | except Exception as e: 248 | # Handle cases where pivot might fail (e.g., insufficient data) 249 | print(f"Could not create heatmap for {params[i]} vs {params[j]}: {e}") 250 | sns.heatmap(pd.DataFrame(), ax=ax, cbar=False) # Draw empty plot 251 | 252 | # Clean up axes labels for inner plots 253 | ax.set_xlabel(params[i] if j == nfac -1 else "") 254 | ax.set_ylabel(params[j] if i == 0 else "") 255 | if i > 0: ax.set_yticklabels([]) 256 | if j < nfac - 1: ax.set_xticklabels([]) 257 | 258 | # Hide unused 
upper triangle plots 259 | for i in range(nfac - 1): 260 | for j in range(i): 261 | axs[j, i].set_visible(False) 262 | 263 | # Add a single color bar for the whole plot 264 | fig.colorbar(axs[1, 0].collections[0], ax=axs[:, -1], location='right', shrink=0.6) 265 | 266 | plt.tight_layout(rect=[0, 0, 0.9, 0.98]) # Adjust layout 267 | plt.savefig(fname_out, dpi=300) 268 | plt.close() 269 | 270 | ``` 271 | * This function iterates through pairs of factors. For each pair, it uses `pandas.pivot_table` to aggregate the `target_name` (e.g., `Y Exp Mean`) for each combination of levels. Then, it uses `seaborn.heatmap` to draw the colored grid representing these average values. 272 | 273 | ## Conclusion 274 | 275 | In this chapter, we explored **Result Visualization**. We saw that `DoEgen` automatically generates various plots like bar charts, correlation plots, and heatmaps during the result analysis phase ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)). 276 | 277 | These visualizations are incredibly helpful because they turn complex tables of numbers into easy-to-understand pictures. We learned how to interpret the key plots to quickly grasp factor importance, understand relationships between factors and outcomes, and identify potential interactions. These visual insights are crucial for communicating your experimental findings effectively. 278 | 279 | We've now walked through the main workflow of `DoEgen`, from defining the experiment to visualizing the results. Throughout this process, we've mentioned using configuration files (like `settings_design.yaml` and `settings_expresults.yaml`) to control `DoEgen`'s behavior. Let's wrap up by looking more closely at how these files work in the final chapter. 280 | 281 | Next up: [Chapter 8: Configuration Handling](08_configuration_handling_.md). 
282 | 283 | --- 284 | 285 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/08_configuration_handling_.md: -------------------------------------------------------------------------------- 1 | # Chapter 8: Configuration Handling 2 | 3 | In [Chapter 7: Result Visualization](07_result_visualization_.md), we saw how `DoEgen` turns our analysis results into easy-to-understand plots. Throughout the previous chapters, we've hinted at using "settings files" (like `settings_design.yaml`) to tell `DoEgen` things like where our input files are located or where to save the output. This final chapter dives into how this **Configuration Handling** works. 4 | 5 | ## Why Do We Need Settings Files? 6 | 7 | Imagine you have a TV. You wouldn't want to open up the back and rewire it every time you wanted to change the channel, adjust the volume, or switch the input source! Instead, you use a **remote control** or a **settings menu**. 8 | 9 | **Configuration Handling** in `DoEgen` is exactly like that settings menu or remote control. It provides a way to tell `DoEgen` *how* to run without changing the core Python code itself. This is super important because: 10 | 11 | * **Flexibility:** You might want to run `DoEgen` on different experiments with different input files (`Experiment_setup.xlsx`) located in different folders. 12 | * **Customization:** You might want to change how long `DoEgen` spends searching for the best design (`maxtime_per_run`) or the maximum number of experiments (`nrun_max`) it should consider. 13 | * **Reusability:** You can easily reuse the same `DoEgen` code for various projects just by changing the settings file. 
14 | 15 | Instead of hard-coding file paths or run limits directly into the Python scripts (which would be like rewiring the TV!), `DoEgen` reads these settings from simple text files when it starts up. 16 | 17 | ## What is Configuration Handling in `DoEgen`? 18 | 19 | Configuration Handling is the system `DoEgen` uses to manage all its operational settings. Think of it as the **central control panel** for the tool. It controls things like: 20 | 21 | * **Input Files:** Where is the `Experiment_setup.xlsx` file? Where is the `Experiment_results.xlsx` file? 22 | * **Output Location:** Where should `DoEgen` save the generated designs, efficiency reports, analysis results, and plots? 23 | * **Design Generation Parameters:** What's the minimum (`nrun_min`) and maximum (`nrun_max`) number of experiment runs to try generating? How big should the steps (`delta_nrun`) be between run sizes? How much time (`maxtime_per_run`) should be spent optimizing each run size? 24 | * **Analysis Options:** (Though less configurable in the current version, future versions might add options here). 25 | 26 | ## The Tool: Simple YAML Files 27 | 28 | `DoEgen` uses a simple, human-readable file format called **YAML** (often pronounced "yam-ul") for its configuration files. YAML files typically end with the `.yaml` or `.yml` extension. 29 | 30 | YAML is designed to be easy for both humans to read and write, and for computers to parse. It uses indentation (spaces) and key-value pairs separated by a colon (`:`). 31 | 32 | Here's a tiny example of what YAML looks like: 33 | 34 | ```yaml 35 | # This is a comment 36 | project_name: My Cake Baking Experiment 37 | output_directory: /path/to/my/results/ 38 | max_runs: 50 39 | use_feature_x: true 40 | ``` 41 | 42 | This is much easier to understand than complex code! 43 | 44 | ## `DoEgen`'s Settings Files 45 | 46 | `DoEgen` typically uses two main settings files: 47 | 48 | 1. 
**`settings_design.yaml`:** Used when running the design generation step ([Chapter 2: Design Generation](02_design_generation_.md)). 49 | 2. **`settings_expresults.yaml`:** Used when running the result analysis step ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)). 50 | 51 | You can create default template files using the command: `python -m doegen.init_config` 52 | 53 | Let's look at simplified examples of what you might find in these files: 54 | 55 | **Example: `settings_design.yaml` (Simplified)** 56 | 57 | ```yaml 58 | # Settings for Design Generation (doegen.doegen) 59 | 60 | # --- Input --- 61 | # Path to the folder containing the setup file 62 | path: 'test/input/' 63 | # Name of the Excel setup file 64 | fname_setup: 'Experiment_setup.xlsx' 65 | 66 | # --- Output --- 67 | # Path where all output folders and files will be saved 68 | outpath: 'test/output/' 69 | 70 | # --- Design Generation Parameters --- 71 | # Maximum number of runs to generate a design for 72 | nrun_max: 150 73 | # Minimum number of runs to start generating from (can be None to let DoEgen calculate) 74 | nrun_min: None 75 | # Step size between run numbers (e.g., generate for 12, 18, 24... if delta=6) 76 | delta_nrun: 6 77 | # Maximum time (seconds) to spend optimizing for EACH run size 78 | maxtime_per_run: 100 79 | ``` 80 | 81 | * **Explanation:** This tells `DoEgen` where to find the `Experiment_setup.xlsx` file, where to save everything (`test/output/`), and the parameters controlling the search for designs (generate designs from a calculated minimum up to 150 runs, in steps of 6, spending up to 100 seconds on each size). 
82 | 83 | **Example: `settings_expresults.yaml` (Simplified)** 84 | 85 | ```yaml 86 | # Settings for Result Analysis (doegen.doeval) 87 | 88 | # --- Input --- 89 | # Path containing the results and the specific design table used 90 | inpath: 'test/output/' 91 | # Name of the Excel results file you filled in 92 | fname_results: 'Experiment_results_Nrun72.xlsx' 93 | # Name of the specific design table CSV file you actually used for experiments 94 | fname_design: 'Designtable_optimal_Nrun72.csv' 95 | 96 | # --- Output --- 97 | # Path where analysis results (tables, plots) will be saved 98 | outpath: 'test/expresults/' 99 | ``` 100 | 101 | * **Explanation:** This tells the analysis script (`doeval`) where to find the results file (`Experiment_results_Nrun72.xlsx`) and the corresponding design table (`Designtable_optimal_Nrun72.csv`). It also specifies that the analysis output should go into the `test/expresults/` folder. 102 | 103 | ## How `DoEgen` Uses the Settings 104 | 105 | When you run a `DoEgen` command, you tell it which settings file to use: 106 | 107 | ```bash 108 | # Running design generation 109 | python -m doegen.doegen settings_design.yaml 110 | 111 | # Running result analysis 112 | python -m doegen.doeval settings_expresults.yaml 113 | ``` 114 | 115 | The script (`doegen.py` or `doeval.py`) starts by: 116 | 1. Looking at the command-line argument (`settings_design.yaml` or `settings_expresults.yaml`). 117 | 2. Opening and reading that specific YAML file. 118 | 3. Loading the settings (like `outpath`, `nrun_max`, etc.) into memory. 119 | 4. Using these loaded values throughout its execution to know where files are, how many runs to generate, etc. 120 | 121 | ## Creating and Editing Settings Files 122 | 123 | * **Templates:** `DoEgen` provides template YAML files (you can generate them with `python -m doegen.init_config` or find them in the installation). 124 | * **Editing:** You just need a simple text editor (like Notepad, VS Code, Sublime Text, etc.) 
to open the `.yaml` file and change the values after the colons (`:`) to match your file locations and desired parameters. 125 | * **Indentation:** Be careful with indentation (spaces at the beginning of lines) if you add more complex structures, as YAML uses it to understand the file structure. For simple key-value pairs like in the examples, indentation is usually not an issue. 126 | 127 | ## How It Works Under the Hood 128 | 129 | `DoEgen` uses standard Python libraries to handle configuration files. 130 | 131 | **High-Level Steps:** 132 | 133 | 1. You run the script, providing the path to your `.yaml` settings file. 134 | 2. The script uses Python's `argparse` library to get the settings file path from the command line. 135 | 3. It uses the `PyYAML` library (or `yaml` for short) to open and read the YAML file. 136 | 4. The `yaml` library parses the text, understanding the `key: value` pairs and the structure. 137 | 5. It converts the YAML content into a Python data structure (usually a dictionary). 138 | 6. The main `DoEgen` script can then easily access the settings by looking up keys in this dictionary (e.g., get the value associated with the key `outpath`). 
139 | 140 | **Sequence Diagram:** 141 | 142 | ```mermaid 143 | sequenceDiagram 144 | participant U as User 145 | participant Script as DoEgen Script (e.g., doegen.py) 146 | participant ArgParse as Argument Parser 147 | participant YAMLlib as PyYAML Library 148 | participant Settings as Settings Dictionary 149 | 150 | U->>Script: Runs `python -m doegen.doegen settings_design.yaml` 151 | Script->>ArgParse: Gets settings file path ('settings_design.yaml') 152 | ArgParse-->>Script: Returns the path 153 | Script->>YAMLlib: Asks PyYAML to load the file at the path 154 | YAMLlib->>YAMLlib: Opens and parses 'settings_design.yaml' 155 | YAMLlib-->>Script: Returns settings as a Python dictionary 156 | Script->>Settings: Stores the loaded settings 157 | Script->>Settings: Accesses values (e.g., Settings['outpath']) during execution 158 | ``` 159 | 160 | **Code Snippet (Simplified from `configloader.py`):** 161 | 162 | This is how `DoEgen` might load the settings file at the very beginning. 163 | 164 | ```python 165 | # Simplified from doegen/configloader.py or doegen/doegen.py main_cli 166 | 167 | import argparse # Library to handle command-line arguments 168 | import yaml # Library to read YAML files 169 | 170 | def load_settings(default_path='settings_design.yaml'): 171 | """Loads settings from a YAML file specified on the command line.""" 172 | 173 | # 1. Set up to read the file path from the command line 174 | parser = argparse.ArgumentParser() 175 | # Allow the user to specify a settings file path, or use the default 176 | parser.add_argument('settings_path', nargs='?', default=default_path) 177 | args = parser.parse_args() # Get the arguments provided by the user 178 | 179 | settings_file_path = args.settings_path 180 | print(f"Using settings from: {settings_file_path}") 181 | 182 | try: 183 | # 2. Open and read the YAML file 184 | with open(settings_file_path) as f: 185 | # Use yaml.safe_load to parse the file safely 186 | cfg = yaml.safe_load(f) 187 | 188 | # 3. 
Return the loaded settings (as a dictionary) 189 | print("Settings loaded successfully.") 190 | return cfg 191 | 192 | except FileNotFoundError: 193 | print(f"Error: Settings file not found at {settings_file_path}") 194 | return None 195 | except Exception as e: 196 | print(f"Error loading settings file: {e}") 197 | return None 198 | 199 | # --- How the main script might use this --- 200 | # settings = load_settings() 201 | # if settings: 202 | # # Now access values like: 203 | # output_folder = settings['outpath'] 204 | # max_runs = settings['nrun_max'] 205 | # # ... use these values in the rest of the script ... 206 | # else: 207 | # print("Could not load settings. Exiting.") 208 | ``` 209 | 210 | * **Explanation:** This code snippet first uses `argparse` to figure out which settings file the user wants to load (getting the path from the command line). Then, it opens that file and uses `yaml.safe_load()` to read the content and convert it into a Python dictionary called `cfg`. The main script can then use this `cfg` dictionary to get the values for `outpath`, `nrun_max`, etc. 211 | 212 | ## Benefits of Using Configuration Files 213 | 214 | * **Easy Customization:** Change file paths, run limits, etc., without touching the code. 215 | * **Separation of Concerns:** Keeps the "what to do" (code logic) separate from the "how to do it specifically this time" (settings). 216 | * **Reproducibility:** Save your settings file along with your results to remember exactly how an analysis was run. 217 | * **Sharing:** Share settings files easily with collaborators. 218 | 219 | ## Conclusion 220 | 221 | This chapter explored **Configuration Handling** in `DoEgen`. We learned that `DoEgen` uses simple, human-readable **YAML files** (like `settings_design.yaml` and `settings_expresults.yaml`) as its "settings panel" or "remote control". These files allow you to easily specify input/output paths, run limits, and other parameters without modifying the core Python code. 
222 | 223 | We saw examples of these files, how `DoEgen` reads them using the `PyYAML` library, and why this approach is beneficial for flexibility and reproducibility. 224 | 225 | This concludes the main tutorial chapters for `DoEgen`! We've journeyed from defining an experiment ([Chapter 1: Experiment Setup Definition](01_experiment_setup_definition_.md)), generating and evaluating designs ([Chapter 2: Design Generation](02_design_generation_.md), [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md)), selecting the best plan ([Chapter 4: Design Selection](04_design_selection_.md)), inputting results ([Chapter 5: Experiment Result Input & Merging](05_experiment_result_input___merging_.md)), analyzing the outcomes ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)), visualizing the findings ([Chapter 7: Result Visualization](07_result_visualization_.md)), and finally, understanding how to configure the tool's operation ([Chapter 8: Configuration Handling](08_configuration_handling_.md)). 226 | 227 | We hope this tutorial provides a solid foundation for using `DoEgen` to design and analyze your own experiments effectively. Happy experimenting! 228 | 229 | --- 230 | 231 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) -------------------------------------------------------------------------------- /docs/DoEgen_explained/index.md: -------------------------------------------------------------------------------- 1 | # Tutorial: DoEgen 2 | 3 | `DoEgen` is a Python tool designed to help researchers with **Design of Experiments (DoE)**. 4 | It *automates* the creation of efficient experimental plans (*design generation*), helps evaluate how *good* these plans are (*design evaluation*), and suggests the best plans to use (*design selection*). 
5 | After running the experiments, `DoEgen` can take the results, combine them with the plan, and perform *statistical analysis* and create *visualizations* to understand which factors are most important and what settings yield the best outcomes. 6 | It uses simple Excel templates for defining experiments and inputting results, and YAML files for configuration. 7 | 8 | 9 | **Source Repository:** [https://github.com/sebhaan/DoEgen](https://github.com/sebhaan/DoEgen) 10 | 11 | ```mermaid 12 | flowchart TD 13 | A0["Experiment Setup Definition 14 | "] 15 | A1["Design Generation 16 | "] 17 | A2["Design Evaluation & Efficiency Metrics 18 | "] 19 | A3["Design Selection 20 | "] 21 | A4["Experiment Result Input & Merging 22 | "] 23 | A5["Result Analysis & Statistics 24 | "] 25 | A6["Result Visualization 26 | "] 27 | A7["Configuration Handling 28 | "] 29 | A0 -- "Defines inputs for" --> A1 30 | A1 -- "Provides design for" --> A2 31 | A2 -- "Provides metrics for" --> A3 32 | A1 -- "Provides design for" --> A4 33 | A4 -- "Provides merged data for" --> A5 34 | A5 -- "Provides data for" --> A6 35 | A7 -- "Configures" --> A1 36 | A7 -- "Configures" --> A5 37 | ``` 38 | 39 | ## Chapters 40 | 41 | 1. [Experiment Setup Definition 42 | ](01_experiment_setup_definition_.md) 43 | 2. [Design Generation 44 | ](02_design_generation_.md) 45 | 3. [Design Evaluation & Efficiency Metrics 46 | ](03_design_evaluation___efficiency_metrics_.md) 47 | 4. [Design Selection 48 | ](04_design_selection_.md) 49 | 5. [Experiment Result Input & Merging 50 | ](05_experiment_result_input___merging_.md) 51 | 6. [Result Analysis & Statistics 52 | ](06_result_analysis___statistics_.md) 53 | 7. [Result Visualization 54 | ](07_result_visualization_.md) 55 | 8. 
[Configuration Handling 56 | ](08_configuration_handling_.md) 57 | 58 | 59 | --- 60 | 61 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge) 62 | -------------------------------------------------------------------------------- /docs/MANUAL.md: -------------------------------------------------------------------------------- 1 | # DoEgen: A Python Library for Optimised Design of Experiment Generation and Evaluation 2 | 3 | DoEgen is a Python library aiming to assist in generating optimised Design of Experiments (DoE), evaluating design efficiencies, and analysing experiment results. 4 | 5 | In a first step, optimised designs can be automatically generated and efficiencies evaluated for any mixture of factor-levels for numeric and categorical factors. Designs are automatically evaluated as function of number of experiment runs and the most efficient designs are suggested. In particular DoEgen provides computation of a wide range of design efficiencies and allows to import and evaluate externally generated designs as well. 6 | 7 | The second part of DoEgen assists in analysing any derived experiment results in terms of factor importance, correlations, and response analysis for best parameter space selection. 
8 | 9 | Written by Sebastian Haan (Sydney Informatics Hub, The University of Sydney) 10 | 11 | 12 | ## Table of Contents 13 | - [Definitions](#definitions) 14 | - [Functionality](#functionality) 15 | - [Installation And Requirements](#installation-and-requirements) 16 | - [Requirements](#requirements) 17 | - [User Templates](#user-templates) 18 | - [Running tests](#running-tests) 19 | - [Documentation](#documentation) 20 | - [Main Modules and Usage](#main-modules-and-usage) 21 | - [Design Generation](#design-generation) 22 | - [Design Efficiencies](#design-efficiencies) 23 | - [Design Selection](#design-selection) 24 | - [Experiment Result Analysis](#experiment-result-analysis) 25 | - [Use Case Study](#use-case-study) 26 | - [Comparison to Other DoE Tools](#comparison-to-other-doe-tools) 27 | - [Literature](#literature) 28 | - [Attribution and Acknowledgments](#attribution-and-acknowledgements) 29 | - [License](#license) 30 | 31 | ## Definitions 32 | 33 | An Experiment Design is typically defined by: 34 | 35 | * Number of Factors: the parameters or variates of the experiment 36 | * Number of Runs: the number of experiments 37 | * Levels: The number of value options for each factor, which can be either numeric values (discrete or continuous) or categorical. Discrete levels for continuous factors can be obtained by providing the minimum and maximum of the factor range and the number of levels. The more levels, the more "fine-grained" the experiment will evaluate this factor, but also more experimental runs are required. 38 | 39 | The goal of optimising an experimental design is to provide an efficient design that is near-optimal in terms of, e.g., orthogonality, level balance, and two-way interaction coverage, yet can be performed with a minimum number of experimental runs, which are often costly or time-consuming. 
40 | 41 | 42 | ## Functionality 43 | 44 | If you would like to jumpstart a new experiment and to skip the technical details, you can find a summary of the main usage of DoEgen in [Case Study Use Case]. 45 | 46 | Currently, the (preliminary) release contains several functions for generating and evaluating designs. Importing and evaluating external designs is supported (e.g. for comparison to other DoE generator tools). DoE also implements several functions for experiment result analysis and visualisation of parameter space. 47 | 48 | The main functionalities are (sorted in order of typical experiment process): 49 | 50 | * Reading Experiment Setup Table and Settings (Parameter Name, Levels for each factor, Maximum number of runs, Min/Max etc) 51 | * Generating optimised design arrays for a range of runs (given maximum number of runs, and optional computation-time constraints, see `settings_design.yaml`). 52 | * Evaluation and visualisation of more than ten design efficiencies such as level balance, orthogonality, D-efficiencies etc (see [Design Efficiencies] for the complete list). 53 | * Automatic suggestion of minimum, optimal, and best designs within a given range of experiment runs. 54 | * Import and evaluation of externally generated design arrays. 55 | * Experiment result analysis: Template table for experiment results, multi-variant RMSE computation, best model/parameter selection, Factor Importance computation, pairwise response surface and correlation computation, factor correlation analysis and Two-way interaction response plots. 56 | * Visualisation of experiment results. 
57 | 58 | 59 | ## Installation And Requirements 60 | 61 | 62 | ### Requirements 63 | 64 | - Python >= 3.6 65 | - SWIG >=3.0.12 66 | - OApackage 67 | - xlrd 68 | - XlsxWriter 69 | - Numpy 70 | - Pandas 71 | - PyYAML 72 | - scikit-learn 73 | - matplotlib 74 | - seaborn 75 | 76 | The DoEgen package is currently considered experimental and has been tested with the libraries specified in `requirements.txt`. 77 | 78 | The OApackage requires an installation of SWIG, which can be found at https://www.dev2qa.com/how-to-install-swig-on-macos-linux-and-windows/ or can be installed via conda 79 | 80 | ```sh 81 | conda install swig 82 | ``` 83 | 84 | After installing `swig` and `numpy`, DoEgen can be installed with 85 | 86 | ``` sh 87 | python setup.py build 88 | python setup.py install 89 | ``` 90 | 91 | Note that OAPackage can be also installed manually by following installation instructions and documentation for OApackage (tested with 92 | OApackage 2.6.6), which can be found at https://pypi.org/project/OApackage/. 93 | 94 | 95 | ### User Templates 96 | 97 | 1) The factor (parameter) settings of experiment are defined in an experiment setup table (see `Experiment_results_template.xlsx`). A new excel setup template table can be also created with `create_setupfile.py`. 98 | Each factor is on a new row and specified by `Parameter Name`, `Parameter Type` , `Level Number`, `Minimum`, `Maximum` 99 | 100 | 2) After the experiment is run, the results have to be filled in an experiment result table (see `Experiment_results_template.xlsx`). A new excel result template table can be also created with `create_resultfile.py` 101 | The result table allows to fill in multiple output properties (Y_label: output target to be predicted) and experiment positions. The results have to be provided in the table with the following columns: 102 | 103 | * `Nexp`: Run# of experiment, need to match Run# in Experiment setup and design. 
104 | * `PID`: Identifier# or label of location (point) in experiment (e.g. if experiment is run at different locations simultaneously). 105 | * `Y Label`: Identifier# or label of Y-Variate (target property that has to be predicted or evaluated, e.g. Rain and Temperature). This allows to include multi-output models with distinct target properties. Note that currently each Y variate is evaluated separately. 106 | * `Y Exp` The experiment result for Y 107 | * `Y Truth` (optional) if the true value is available for Y. This is required to calculate the RMSE and to select best parameter space. 108 | * Not currently considered (yet) in result stats computation: `Std Y Exp`, `Std Y Truth`, `Weight PID` 109 | 110 | ![Experiment Setup Table Header.](figures/Setup_header.png){width=600} 111 | 112 | 113 | ![Experiment Result Table Header.](figures/Result_header.png){width=600} 114 | 115 | 116 | ### Running Tests 117 | 118 | To verify that DoEgen works, you can run the example experiment 119 | 120 | ``` bash 121 | $ python -m doegen.init_tests 122 | $ python -m doegen.doegen test/settings_design_test.yaml 123 | $ python -m doegen.doeval test/settings_expresults_test.yaml 124 | ``` 125 | 126 | ### Documentation 127 | 128 | Please do not modify `README.md`. 
Instead make any changes in the master documentation file `MANUAL.md` (uses pandoc markdown syntax) and then convert to the inferior Github markdown flavor (note that the new github-flavored markdown format gfm option does not correctly solve figure caption and resize options): 129 | ```bash 130 | pandoc -f markdown -t markdown_github MANUAL.md -o README.md 131 | ``` 132 | and to pdf: 133 | ```bash 134 | pandoc -V geometry:margin=1.2in MANUAL.md -o docs/MANUAL.pdf 135 | ``` 136 | or as standalone html: 137 | ```bash 138 | pandoc MANUAL.md -o MANUAL.html 139 | ``` 140 | 141 | ## Main Modules and Usage 142 | 143 | 144 | ### Design Generation 145 | Design generation with `doegen.py`: 146 | Main module for generating optimised designs and computation of efficiencies. 147 | Settings are specified in settings yaml file `settings_design.yaml`. 148 | If the yaml and .xlsx template files are not yet in your working directory (e.g. after first doegen installation), you can create the yaml and excel template files with 149 | 150 | ``` bash 151 | $ python -m doegen.init_config 152 | ``` 153 | 154 | Before running `doegen.py`, two things have to be done: 155 | 156 | 1) fill in experiment setup table (see template provided `Experiment_setup_template.xlsx` or example in `test/` folder) 157 | 2) provide settings in settings file (see `settings_design.yaml`) 158 | 159 | Now you are ready to run the design generation 160 | 161 | ``` bash 162 | $ python -m doegen.doegen settings_design.yaml 163 | ``` 164 | 165 | This will produce a number of files for different experiment run lengths (see folder `test/results/DesignArray_Nrun...`): 166 | 167 | * The optimised design array `EDarray_[factor_levels]_Nrun.csv`. 
168 | * A table of design efficiencies `Efficiencies_[factor_levels]_Nrun.csv` 169 | * Table of Canonical Correlation Coefficients `Table_Canonical_Correlation.csv` 170 | * Table of two-way Interaction balance `Table_Interaction_Balance.txt` 171 | * Table of Pearson correlation coefficients between all factor pairs `Table_Pearson_Correlation.csv` 172 | * Plot of pairwise correlation including regression fit `pairwise_correlation.png` (see example plot below) 173 | 174 | Besides the default optimisation (based on function `doegen.doegen.optimize_design`), DoEgen also allows the user to construct full orthogonal designs using the function `doegen.doegen.gen_highD`, which is based on OApackage orthogonal arrays and extensions. However, this works only for special cases with limited number of factors and design levels. Thus, it is currently not fully automated but might assist advanced users to construct optimal designs. 175 | 176 | 177 | ### Design Selection 178 | 179 | DoEgen will select by default three designs based on the following criteria: 180 | 181 | 1) minimum Design with the criteria: 182 | 183 | * number of runs >= number of factors + 1 184 | * center balance > 95% 185 | * level balance > 95% 186 | * Orthogonal Balance > 90% 187 | * Two Level interaction Balance > 90% 188 | * Two Level Interaction Minimum One = 100% 189 | 190 | 2) optimal Design with the criteria: 191 | 192 | * center balance > 98% 193 | * level balance > 98% 194 | * Orthogonal Balance > 95% 195 | * Two Level interaction Balance > 95% 196 | * Two Level Interaction Minimum One = 100% 197 | 198 | 3) best design which is based on best score that is sum of efficiencies above and includes a small penalty for runsize relative to maximum runsize 199 | 200 | This will deliver (see folder `test/results/`): 201 | 202 | * Overview summary of the three designs and their main efficiencies: `Experiment_Design_selection_summary.txt` 203 | * Three tables (`Designtable_minimum/optimal/best...csv`) for the three 
suggested designs that are converted into the actual level values 204 | * An overview of the efficiencies is plotted as function of exp run and saved in `Efficiencies_[factor_levels].png` 205 | 206 | In case the user wants to select another design for a different run size, one can convert the design array into a design table with the function `doegen.doegen.array2valuetable()`. 207 | 208 | ![Example overview plot of the main efficiencies (from 0=worst to 100=best) as function of number of experiments.](figures/Efficiencies.png){width=400} 209 | 210 | 211 | ### Design Efficiencies 212 | 213 | DoEgen computes more than ten efficiencies and saves them as .csv file for each generated design array. 214 | All indicators, except for the canonical correlations, have a range from 0 (worst possible) to 1 (optimal): 215 | 216 | * Center Balance: 100% [1 - Sum(Center-Deviation)/Array Size], i.e. the average center balance over all factors. 217 | * Level Balance: Defined as 100% [1 - Sum(Imbalance)/Array Size], the average level balance over all factors. 218 | * Orthogonality: Defined as 100% [1 - Orthogonality], i.e. the average orthogonality over all factor pairs. 219 | * Two-way Interaction Balance: Similar to level balance but for pairwise factor balance. 220 | * Two-way Interaction with at least one occurrence: 100% [1 - Sum(Not at least one pairwise factor occurrence)/number of pairwise combinations]; 100% if all factor-level pair combinations occur at least once. 221 | * D-Eff: D-Efficiency (model includes main term and quadratic). 
222 | * D1 Eff: only main terms 223 | * D2 Eff: main, quadratic, and interaction terms 224 | * A-Eff: A-efficiency (main term and quadratic) 225 | * A1-Eff: only main terms 226 | * A2-Eff: main, quadratic, and interaction terms 227 | * Acor_can_avg: average canonical correlation efficiency 228 | * Acor_can_max: maximal canonical correlation coefficient 229 | 230 | For further inspection, `doegen.doegen.evaluate_design2` creates also the following tables and plots: 231 | 232 | * Table of Canonical Correlation 233 | * Table of Pearson Correlation (same as above if normalised discrete variables) 234 | * Table of Two-way Interaction Balance 235 | * Cornerplot of pairwise factor relation with Y 236 | 237 | ![Pairwise factor correlation plot of an example 8 factor design array with a mix of 3- and 2-level factors. The lines and blue shadows correspond to the linear regression fit and its uncertainty. Two pairs are 100% orthogonal if the linear regression line is horizontal. The diagonal bar charts show the histogram of level values for each factor (perfect level balance if histogram is flat).](figures/pairwise_correlation.png){width=600} 238 | 239 | 240 | 241 | ### Experiment Result Analysis 242 | 243 | Experiment Result Analysis with `doeval.py`: 244 | The experiment results have to be provided in a result table with the format as specified in #user-templates, and specifications in the `settings_expresults.yaml` file. 245 | Then run 246 | ``` bash 247 | $ python -m doegen.doeval settings_expresults.yaml 248 | ``` 249 | This will create the following stats tables and plots (see folder `test/expresults/` as example): 250 | 251 | * A valuation of the factors in terms of "importance", which is defined by the maximum change (range) in the average Y between any factor levels. Results are visualized in bar plot and saved as csv, including min, max, std deviation across all levels 252 | * Computes RMSE between experiment result and ground truth; results saved as csv. 
253 | * Ranks list of top experiments and their parameters based on RMSE 254 | * Computes average and variance of best parameters weighted with RMSE; saved to csv file 255 | * An overview plot of all the correlation plots between Y and each factor (see function `plot_regression`) 256 | * Moreover it will plot Y value for each pairwise combination of factors (see function `plot_3dmap`), which allows the user to visualise categorical factors 257 | 258 | ![Overview plot of X-Y Correlation for each factor as function of their level values. On top the linear regression coefficient `r` is shown along the linear regression fit and its uncertainty (line and shadow).](figures/Expresult_correlation_X_1.png){width=600} 259 | 260 | ![Cornerplot of pairwise factor relation with Y. The color(bar) indicates the value of Y.](figures/Expresult_pairwise-correlation_1.png){width=600} 261 | 262 | 263 | 264 | 265 | ## Use Case Study 266 | 267 | Here we demonstrate a typical use case where we would like to first generate and select an optimal experiment design. Then subsequently after running the experiment we would like to answer the question which is the best parameter space and what parameters are important. Our case study is given by the test example, which consists of 8 factors (parameters) that are specified in the experiment setup table `Experiment_setup_test.xlsx`. 268 | 269 | ![Test Experiment Setup Table with 6 discrete and 2 categorical factors. Each factor can have a certain number of levels (values), which are here either 3 or 2](figures/Setup_header_test.png){width=600} 270 | 271 | The first goal is to generate an efficient design with only a fraction of the entire parameter combination (in our case the full factorial would be $3^6 \times 2^2 = 2916$). 
The maximum number of experiments (in this case we choose 150) is set in the file `settings_design_test.yaml`, which also specifies input and output directory names, as well as the maximum time for optimising one run (in this case 100 seconds per design optimisation). This configuration will generate and optimize a range of experiments with different design run sizes from 12 to 150, in steps of 6 runsizes (since the lowest common multiple of our mix of 2 and 3 factor levels is 6). Note that the user can also choose a different stepsize, which can be done by setting the value in the setting parameter `delta_nrun`. 272 | Now we are all set up to start the experiment design generation and optimisation script, which we do by running the script doegen.py with the settings file as argument: 273 | ``` bash 274 | $ cd DoEgen 275 | $ python -m doegen.doegen test/settings_design_test.yaml 276 | ``` 277 | This will generate for each runsize an optimised design array and a list of efficiencies and diagnostic tables and plots (see [Design Generation] for more details). To simplify the selection of the generated experiment designs, DoEgen suggests automatically three designs: 1) one minimum design (lowest number of runs at given efficiency threshold), 2) one optimal design, and 3) one best design (either equal or has larger experiment run number than optimal design). In our case the three designs are selected for run numbers 30 (minimum), 72 (optimal), 90 (best). Since the optimal design has basically almost the same efficiencies as the best design (see figure below) but at a lower cost of experiment runs, we choose for our experiment the optimal design, which is given in the table `Designtable_optimal_Nrun72.csv`. 278 | 279 | ![Result Overview of Experiment Design Generation and the three suggested choices. 
The most important criteria for a good design are orthogonality (100% means that all factor pairs are 100% orthogonal to each other), level/center balance (100% is best) and two-way interaction balance (100% is best). We also want to make sure that each pairwise interaction occurs at least once (100% Two-Level Min Efficiency). D-efficiency maximises the determinant of the information matrix $|X^T X|$, which corresponds to minimizing the generalized variance of the parameter estimates for a pre-specified model $X$. Here, D1-efficiency defines the model with only the main effects, while D-efficiency includes also all quadratic terms in the model $X$. Typically D1-efficiency should be larger than 60%, while D-efficiency only increases if number of experiments is much larger than the number of model terms. In this case study we consider only D1-efficiency given that we want to minimize the number of experiments.](figures/Results_overview.png){width=600} 280 | 281 | ![Header with first 5 rows of the optimal design with 72 experiments](figures/Designtable_optimal_Nrun72.png){width=600} 282 | 283 | Now it is time to run the experiment. In our example, we produce just some random data for the 72 experiments with 10 sensor locations (PID 1 to 10) and one output variable Y (e.g. temperature). To analyse the experiment, the results have to be written in a structured table with the format as given in `experiment_results_Nrun72.xlsx` (see description in figure below). 284 | 285 | ![Header with first rows of the experiment result table for 72 experiments. Note that the `Nexp` number has to match the experiment design table `Nexp`. Each experiment (label `Nexp`) can have multiple locations or points (identifier# `PID`), e.g., if experiment is run at different locations simultaneously. In addition, it is possible that one has multiple output Y-variates, labeled with identifier `Y Label` (target property that has to be predicted or evaluated, e.g. Rain and Temperature). 
The column `Y Exp` holds the experiment result for Y while the column `Y Truth` holds the ground truth value, which is required to calculate the RMSE and to select the best parameter space.](figures/Experiment_result_Nrun72_header.png){width=600} 286 | 287 | To run the experiment analysis script, settings such as input and output directory names are given in the settings file `settings_expresults_test.yaml`, and we can now run the analysis script with 288 | ``` bash 289 | $ python -m doegen.doeval test/settings_expresults_test.yaml 290 | ``` 291 | This analysis produces a range of diagnostic tables and result plots for each output variable Y (in our case we have only one Y). One of the questions of this example use case is to identify what factors are important, which is given in the figure `Ybarplot.png`. The "importance" basically indicates how much a factor changes Y (defined by the maximum average change in Y between any levels). This has the advantage of also identifying important factors that either have low linear regression coefficients with Y (see r values in plot `Expresult_correlation_X.png`) or are categorical. Such insight can be valuable to determine, e.g., which factors should be investigated in more detail in a subsequent experiment or to estimate which factors have no effect on Y. 292 | 293 | ![Factor Importance ranked from maximum to lowest change (range) in Y](figures/Ybarplot_1.png){width=600} 294 | 295 | Another important question is what are the best parameter values based on the obtained experiment results so far? This question can be answered by computing the Root-Mean-Square-Error between experiment results and ground truth (or alternatively the likelihood if the model predictions include also uncertainties). Table `Experiment_1_RMSE_Top10_sorted.csv` provides an overview of the top 10 experiments sorted as a function of their RMSE. 
Moreover, we can calculate the (RMSE-weighted) average of each factor for the top experiments as shown in the bar plot below. 296 | 297 | ![Picture of Table `Experiment_1_RMSE_Top10_sorted.csv` which shows the factor values of the top 10 experiments based on their RMSE values.](figures/Top10.png){width=600} 298 | 299 | ![Factor values of the top 10 experiments based on their RMSE values. The bar heights indicate the top factor's average value and the dark lines their standard deviation. Note that the average and their standard deviation are computed with the weights $RMSE^{-2}$.](figures/BestFactor_Avg1.png){width=600} 300 | 301 | Furthermore, multiple other diagnostic plots such as factor-Y correlation and pairwise correlation maps are generated (see [Experiment Result Analysis] for more details). 302 | 303 | 304 | 305 | 306 | ## Comparison to Other DoE Tools 307 | 308 | The aim of DoEgen is to provide an open-source tool for researchers to create optimised designs and a framework for transparent evaluation of experiment designs. Moreover, DoEgen aims to assist in the result analysis, which may allow the researcher a subsequent factor selection, parameter fine-tuning, or model building. The design generation function of DoEgen is built upon the excellent package `OApackage` and extends it further in terms of design efficiency evaluation, filtering, automation, and experiment analysis. There are multiple other tools available for DoE; the table below provides a brief (preliminary) summary of the main advantages and disadvantages for each tool that has been tested. 
309 | 310 | 311 | Feature | SAS JMP | pyDOE2 | OApackage | DoEgen | 312 | ---------------------------|:---------:|:--------:|:---------:|:--------:| 313 | Open-Source | no (paid) | yes | yes | yes | 314 | Design Optimisation Score | very good | limited | good | good | 315 | Optimal Runsize Finder | no | no | no | yes | 316 | Design Efficiency Eval | yes | no | limited | yes | 317 | Exp Result Analysis | yes | no | no | yes | 318 | Development Stage | advanced | early | moderate |very early| 319 | 320 | 321 | 322 | ## Literature 323 | 324 | [OApackage: A Python package for generation and analysis of orthogonal arrays, optimal designs and conference designs](https://doi.org/10.21105/joss.01097), P.T. Eendebak, A.R. Vazquez, Journal of Open Source Software, 2019 325 | 326 | [pyDOE2: An experimental design package for python](https://github.com/clicumu/pyDOE2) 327 | 328 | Dean, A., Morris, M., Stufken, J. and Bingham, D. eds., 2015. Handbook of design and analysis of experiments (Vol. 7). CRC Press. 329 | 330 | Goos, P. and Jones, B., 2011. Optimal design of experiments: a case study approach. John Wiley & Sons. 331 | 332 | Kuhfeld, W.F., 2010. Discrete choice. SAS Technical Papers, 2010, pp.285-663. 333 | 334 | Zwerina, K., Huber, J. and Kuhfeld, W.F., 1996. A general method for constructing efficient choice designs. Durham, NC: Fuqua School of Business, Duke University. 335 | 336 | Cheong, Y.P. and Gupta, R., 2005. Experimental design and analysis methods for assessing volumetric uncertainties. SPE Journal, 10(03), pp.324-335. 337 | 338 | JMP, A. and Proust, M., 2010. Design of experiments guide. Cary, NC: SAS Institute Inc. 339 | 340 | 341 | 342 | ## Attribution and Acknowledgments 343 | 344 | Acknowledgments are an important way for us to demonstrate the value we bring to your research. Your research outcomes are vital for ongoing funding of the Sydney Informatics Hub. 
345 | 346 | If you make use of this code for your research project, please include the following acknowledgment: 347 | 348 | “This research was supported by the Sydney Informatics Hub, a Core Research Facility of the University of Sydney.” 349 | 350 | 351 | ## Contributors 352 | 353 | We would like to thank Dietmar Muller (School of Geophysics, University of Sydney) for suggesting the need for this library, Danial Azam (School of Geophysics, University of Sydney) for testing DoEgen on real-world cases, Christopher Howden (SIH, University of Sydney) for 354 | statistical consultancy, literature suggestions, and documentation 355 | review, and Joel Nothman for the code review. 356 | 357 | DoEgen has benefited from the OApackage library [OApackage](https://github.com/eendebakpt/oapackage) for the design optimisation code and we would like to thank the researchers who have made their code available as open-source. 358 | 359 | 360 | ## License 361 | 362 | Copyright 2020 Sebastian Haan, The University of Sydney 363 | 364 | DoEgen is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License (AGPL version 3) as published by the Free Software Foundation. 365 | 366 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. 367 | 368 | You should have received a copy of the GNU Affero General Public License along with this program (see LICENSE.md). If not, see [https://www.gnu.org/licenses/](https://www.gnu.org/licenses/). 
369 | -------------------------------------------------------------------------------- /docs/MANUAL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docs/MANUAL.pdf -------------------------------------------------------------------------------- /doegen/Experiment_results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/Experiment_results.xlsx -------------------------------------------------------------------------------- /doegen/Experiment_setup.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/Experiment_setup.xlsx -------------------------------------------------------------------------------- /doegen/Experiment_setup_extended.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/Experiment_setup_extended.xlsx -------------------------------------------------------------------------------- /doegen/__init__.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #DoEgen: A Python Library for Optimised Design of Experiment Generation and Evaluation 3 | # 4 | #DoEgen is free software made available under the LGPL License. 5 | #For details see the LICENSE file. 6 | # 7 | #@author: Sebastian Haan 8 | 9 | """ 10 | DoEgen is a Python library aiming to assist in generating optimised 11 | Design of Experiments (DoE), evaluating design efficiencies, and 12 | analysing experiment results. 
13 | 14 | In a first step, optimised designs can be automatically generated and 15 | efficiencies evaluated for any mixture of factor-levels for numeric and 16 | categorical factors. Designs are automatically evaluated as function of 17 | number of experiment runs and the most efficient designs are suggested. 18 | In particular DoEgen provides computation of a wide range of design 19 | efficiencies and allows to import and evaluate externally generated 20 | designs as well. 21 | 22 | The second part of DoEgen assists in analysing any derived experiment 23 | results in terms of factor importance, correlations, and response 24 | analysis for best parameter space selection. 25 | 26 | Definitions 27 | ----------- 28 | 29 | An Experiment Design is typically defined by: 30 | 31 | - Number of Factors: the parameters or variates of the experiment 32 | - Number of Runs: the number of experiments 33 | - Levels: The number of value options for each factor, which can be 34 | either numeric values (discrete or continuous) or categorical. 35 | Discrete levels for continuous factors can be obtained by providing 36 | the minimum and maximum of the factor range and the number of 37 | levels. The more levels, the more fine-grained the experiment will 38 | evaluate this factor, but also more experimental runs are required. 39 | 40 | The goal of optimising an experimental design is to provide an efficient 41 | design that is near-optimal in terms of, e.g., orthogonality, level 42 | balance, and two-way interaction coverage, yet can be performed with a 43 | minimum number of experimental runs, which are often costly or 44 | time-consuming. 45 | 46 | Functionality 47 | ------------- 48 | 49 | If you would like to jumpstart a new experiment and to skip the 50 | technical details, you can find a summary of the main usage of DoEgen in 51 | Case Study Use Case in the README. 52 | 53 | Currently, the (preliminary) release contains several functions for 54 | generating and evaluating designs. 
Importing and evaluating external 55 | designs is supported (e.g. for comparison to other DoE generator tools). 56 | DoE also implements several functions for experiment result analysis and 57 | visualisation of parameter space. 58 | 59 | The main functionalities are (sorted in order of typical experiment 60 | process): 61 | 62 | - Reading Experiment Setup Table and Settings (Parameter Name, Levels 63 | for each factor, Maximum number of runs, Min/Max etc) 64 | - Generating optimised design arrays for a range of runs (given 65 | maximum number of runs, and optional computation-time constrains, 66 | see `settings_design.yaml`). 67 | - Evaluation and visualisation of more than ten design efficiencies 68 | such as level balance, orthogonality, D-efficiencies etc (see 69 | [Design Efficiencies](#design-efficiencies) for the complete list). 70 | - Automatic suggestion of minimum, optimal, and best designs within a 71 | given range of experiment runs. 72 | - Import and evaluation of externally generated design arrays. 73 | - Experiment result analysis: Template table for experiment results, 74 | multi-variant RMSE computation, best model/parameter selection, 75 | Factor Importance computation, pairwise response surface and 76 | correlation computation, factor correlation analysis and Two-way 77 | interaction response plots. 78 | - Visualisation of experiment results. 79 | 80 | Installation And Requirements 81 | ----------------------------- 82 | 83 | ### Requirements 84 | 85 | - Python >= 3.6 86 | - OApackage 87 | - xlrd 88 | - XlsxWriter 89 | - openpyxl 90 | - Numpy 91 | - Pandas 92 | - PyYAML 93 | - scikit_learn 94 | - matplotlib 95 | - seaborn 96 | 97 | The DoEgen package is currently considered experimental and has been 98 | tested with the libraries specified in `requirements.txt`. 
99 | 100 | Installation instructions and documentation for OApackage (tested with 101 | OApackage 2.7.11) can be found at https://pypi.org/project/OApackage/ or 102 | can be installed with 103 | 104 | pip install OAPackage 105 | 106 | Please see for more details the README. 107 | """ 108 | 109 | __version__ = "0.5.0" 110 | __author__ = "Sebastian Haan" -------------------------------------------------------------------------------- /doegen/configloader.py: -------------------------------------------------------------------------------- 1 | # Load settings 2 | 3 | import argparse 4 | import yaml 5 | 6 | ap = argparse.ArgumentParser() 7 | ap.add_argument('settings_path', nargs='?', default='settings_design.yaml') 8 | args = ap.parse_args() 9 | print(f"using settings in: {args.settings_path!r}") 10 | with open(args.settings_path) as f: 11 | cfg = yaml.safe_load(f) 12 | for key in cfg: 13 | locals()[str(key)] = cfg[key] -------------------------------------------------------------------------------- /doegen/configloader_results.py: -------------------------------------------------------------------------------- 1 | # Load settings 2 | 3 | import argparse 4 | import yaml 5 | 6 | ap = argparse.ArgumentParser() 7 | ap.add_argument('settings_path', nargs='?', default='settings_expresults.yaml') 8 | args = ap.parse_args() 9 | print(f"using settings in: {args.settings_path!r}") 10 | with open(args.settings_path) as f: 11 | cfg = yaml.safe_load(f) 12 | for key in cfg: 13 | locals()[str(key)] = cfg[key] -------------------------------------------------------------------------------- /doegen/create_resultfile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates Excel file with format for experimental design setup 3 | 4 | Author: Sebastian Haan 5 | Affiliation: Sydney Informatics Hub (SIH), THe University of Sydney 6 | Version: 0.1 7 | License: APGL-3.0 8 | """ 9 | 10 | import xlsxwriter 11 | 12 | workbook = 
xlsxwriter.Workbook('Experiment_results_template.xlsx') 13 | worksheet = workbook.add_worksheet() 14 | 15 | workbook.set_properties({ 16 | 'title': 'Experimental Design Results', 17 | 'subject': 'Template', 18 | 'author': 'Sebastian Haan', 19 | 'company': 'SIH, The University of Sydney', 20 | 'comments': 'Created with Python and XlsxWriter' 21 | }) 22 | 23 | # Add a format for the header cells. 24 | header_format = workbook.add_format({ 25 | 'border': 2, 26 | 'bg_color': '#C6EFCE', 27 | 'bold': True, 28 | 'text_wrap': True, 29 | 'valign': 'bottom', 30 | 'indent': 1, 31 | 'locked': True 32 | }) 33 | 34 | unlocked = workbook.add_format({'locked': False}) 35 | # Enable worksheet protection 36 | #worksheet.protect(options={'autofilter': True}) 37 | #worksheet.autofilter('A1:B8') 38 | 39 | worksheet.protect() 40 | 41 | header_format.set_font_size(14) 42 | 43 | worksheet.set_default_row(20) 44 | worksheet.set_row(0, 20) 45 | 46 | worksheet.set_column('A:H', 15, unlocked) 47 | 48 | # Write the header cells 49 | # Same identifier as in setupfile, this need to match with experimemnt setup file to merge with associated parameters! 50 | heading1 = 'Nexp' 51 | # Optional: ID of measurement point (e.g. spatial or temporal position), 52 | heading2 = 'PID' 53 | # Optional: index of multi output-target if applicable (optional) 54 | #(e.g. 
Y can be distinct properties or target values even for same PID) 55 | heading3 = 'Y Label' 56 | # Experiment or simulation result for given position PID and output Y Label 57 | heading4 = 'Y Exp' 58 | # Optional: ground truth for given PID and output Y Label 59 | heading5 = 'Y Truth' 60 | # Optional: Standard deviation (noise) of experiment result for given position PID and output Y Label 61 | heading6 = 'Std Y Exp' 62 | # Optional: Standard deviation (noise) of ground truth for given position PID and output Y Label 63 | heading7 = 'Std Y Truth' 64 | # Optional: weight for positional measurement with PID 65 | heading8 = 'Weight PID' 66 | 67 | worksheet.write('A1', heading1, header_format) 68 | worksheet.write('B1', heading2, header_format) 69 | worksheet.write('C1', heading3, header_format) 70 | worksheet.write('D1', heading4, header_format) 71 | worksheet.write('E1', heading5, header_format) 72 | worksheet.write('F1', heading6, header_format) 73 | worksheet.write('G1', heading7, header_format) 74 | worksheet.write('H1', heading8, header_format) 75 | 76 | #Freeze panes 77 | worksheet.freeze_panes(1, 0) # Freeze the first row 78 | 79 | workbook.close() 80 | 81 | print('Excel Template Created') -------------------------------------------------------------------------------- /doegen/create_setupfile.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates Excel file with format for experimental design setup 3 | 4 | Author: Sebastian Haan 5 | Affiliation: Sydney Informatics Hub (SIH), THe University of Sydney 6 | Version: 0.1 7 | License: LGPL-3.0 8 | """ 9 | 10 | import xlsxwriter 11 | 12 | workbook = xlsxwriter.Workbook('Experiment_setup_template.xlsx') 13 | worksheet = workbook.add_worksheet() 14 | 15 | workbook.set_properties({ 16 | 'title': 'Experimental Design Setup', 17 | 'subject': 'Template', 18 | 'author': 'Sebastian Haan', 19 | 'company': 'SIH, The University of Sydney', 20 | 'comments': 'Created with Python 
and XlsxWriter' 21 | }) 22 | 23 | # Add a format for the header cells. 24 | header_format = workbook.add_format({ 25 | 'border': 2, 26 | 'bg_color': '#C6EFCE', 27 | 'bold': True, 28 | 'text_wrap': True, 29 | 'valign': 'bottom', 30 | 'indent': 1, 31 | 'locked': True 32 | }) 33 | 34 | unlocked = workbook.add_format({'locked': False}) 35 | # Enable worksheet protection 36 | #worksheet.protect(options={'autofilter': True}) 37 | #worksheet.autofilter('A1:B8') 38 | 39 | worksheet.protect() 40 | 41 | header_format.set_font_size(14) 42 | 43 | worksheet.set_default_row(20) 44 | worksheet.set_row(0, 30) 45 | 46 | worksheet.set_column('A:E', None, unlocked) 47 | 48 | # Set up layout of the worksheet. 49 | worksheet.set_column('A:A', 50, unlocked) 50 | worksheet.set_column('B:B', 20, unlocked) 51 | worksheet.set_column('C:C', 20, unlocked) 52 | worksheet.set_column('D:D', 20, unlocked) 53 | worksheet.set_column('E:E', 20, unlocked) 54 | 55 | 56 | # Write the header cells and some data that will be used in the examples. 
57 | heading1 = 'Parameter Name' 58 | heading2 = 'Parameter Type' 59 | heading3 = 'Level Number' 60 | heading4 = 'Minimum' 61 | heading5 = 'Maximum' 62 | 63 | worksheet.write('A1', heading1, header_format) 64 | worksheet.write('B1', heading2, header_format) 65 | worksheet.write('C1', heading3, header_format) 66 | worksheet.write('D1', heading4, header_format) 67 | worksheet.write('E1', heading5, header_format) 68 | 69 | #worksheet.write_row('B2:B10', ['Continous', 'Discrete', 'Categorical']) 70 | worksheet.data_validation('B2:B20', {'validate': 'list', 71 | 'source': ['Continuous', 'Discrete', 'Categorical']}) 72 | #worksheet.write_row('C2:C10', ['Integers', 2, 10]) 73 | worksheet.data_validation('C2:C20', {'validate': 'integer', 74 | 'criteria': 'between', 75 | 'minimum': 2, 76 | 'maximum': 10}) 77 | 78 | #Freeze panes 79 | worksheet.freeze_panes(1, 0) # Freeze the first row 80 | 81 | workbook.close() -------------------------------------------------------------------------------- /doegen/create_setupfile_extended.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates Excel file with format for experimental design setup 3 | 4 | Author: Sebastian Haan 5 | Affiliation: Sydney Informatics Hub (SIH), THe University of Sydney 6 | Version: 0.1 7 | License: LGPL-3.0 8 | """ 9 | 10 | import xlsxwriter 11 | 12 | workbook = xlsxwriter.Workbook('Experiment_setup_extended.xlsx') 13 | worksheet = workbook.add_worksheet() 14 | 15 | workbook.set_properties({ 16 | 'title': 'Experimental Design Setup (Extended Version)', 17 | 'subject': 'Template', 18 | 'author': 'Sebastian Haan', 19 | 'company': 'SIH, The University of Sydney', 20 | 'comments': 'Created with Python and XlsxWriter' 21 | }) 22 | 23 | # Add a format for the header cells. 
24 | header_format = workbook.add_format({ 25 | 'border': 2, 26 | 'bg_color': '#C6EFCE', 27 | 'bold': True, 28 | 'text_wrap': True, 29 | 'valign': 'bottom', 30 | 'indent': 1, 31 | 'locked': True 32 | }) 33 | 34 | unlocked = workbook.add_format({'locked': False}) 35 | # Enable worksheet protection 36 | #worksheet.protect(options={'autofilter': True}) 37 | #worksheet.autofilter('A1:B8') 38 | 39 | worksheet.protect() 40 | 41 | header_format.set_font_size(14) 42 | 43 | worksheet.set_default_row(20) 44 | worksheet.set_row(0, 30) 45 | 46 | worksheet.set_column('A:E', None, unlocked) 47 | 48 | # Set up layout of the worksheet. 49 | worksheet.set_column('A:A', 40, unlocked) 50 | worksheet.set_column('B:B', 20, unlocked) 51 | worksheet.set_column('C:C', 20, unlocked) 52 | worksheet.set_column('D:D', 20, unlocked) 53 | worksheet.set_column('E:E', 20, unlocked) 54 | worksheet.set_column('F:F', 20, unlocked) 55 | worksheet.set_column('G:G', 50, unlocked) 56 | 57 | 58 | # Write the header cells and some data that will be used in the examples. 
59 | heading1 = 'Parameter Name' 60 | heading2 = 'Parameter Type' 61 | heading3 = 'Level Number' 62 | heading4 = 'Minimum' 63 | heading5 = 'Maximum' 64 | heading6 = 'Include (Y/N)' 65 | heading7 = 'Levels' 66 | 67 | worksheet.write('A1', heading1, header_format) 68 | worksheet.write('B1', heading2, header_format) 69 | worksheet.write('C1', heading3, header_format) 70 | worksheet.write('D1', heading4, header_format) 71 | worksheet.write('E1', heading5, header_format) 72 | worksheet.write('F1', heading6, header_format) 73 | worksheet.write('G1', heading7, header_format) 74 | 75 | #worksheet.write_row('B2:B10', ['Continous', 'Discrete', 'Categorical']) 76 | worksheet.data_validation('B2:B20', {'validate': 'list', 77 | 'source': ['Continuous', 'Discrete', 'Categorical']}) 78 | #worksheet.write_row('C2:C10', ['Integers', 2, 10]) 79 | worksheet.data_validation('C2:C20', {'validate': 'integer', 80 | 'criteria': 'between', 81 | 'minimum': 2, 82 | 'maximum': 10}) 83 | worksheet.data_validation('F2:F20', {'validate': 'list', 84 | 'source': ['Yes', 'No']}) 85 | 86 | #Freeze panes 87 | worksheet.freeze_panes(1, 0) # Freeze the first row 88 | 89 | workbook.close() -------------------------------------------------------------------------------- /doegen/doeval.py: -------------------------------------------------------------------------------- 1 | """ 2 | Package to evaluate the response and factor effectiveness of experiment results. 
3 | 4 | Author: Sebastian Haan 5 | Affiliation: Sydney Informatics Hub (SIH), THe University of Sydney 6 | Version: Experimental 7 | License: LGPL-3.0 8 | 9 | Tested with Python 3.7 10 | 11 | Main Capabilities: 12 | - Multi-variant RMSE computation and best model/parameter selection 13 | - Factor Importance computation 14 | - Pairwise response surface and correlation computation 15 | - Factor correlation analysis and Two -way intearction response plots 16 | - Visualisation plots 17 | 18 | ToDo: 19 | Change to Pathlib 20 | 21 | Changes to previous version: 22 | - replace configloader with function arguments 23 | """ 24 | 25 | import os 26 | import sys 27 | import argparse 28 | import yaml 29 | from pathlib import Path 30 | import numpy as np 31 | import pandas as pd 32 | from mpl_toolkits import mplot3d 33 | import matplotlib.pyplot as plt 34 | import seaborn as sns 35 | import matplotlib.ticker as ticker 36 | 37 | sns.set() 38 | # Load settings parameters 39 | 40 | 41 | def merge_expresults(fname_result, fname_design, y_label=None): 42 | """ 43 | Reads experiment results into pandas dataframe 44 | and merges with paramater file 45 | 46 | INPUT 47 | fname_result: path + filenmae of experimental results (see excel template) 48 | fname_design: path + filenmae of experimental design setip (see excel template) 49 | y_label: (Default None) Column name for precited y property 50 | """ 51 | dfres = pd.read_excel(fname_result) 52 | dfdes = pd.read_excel(fname_design) 53 | if y_label is not None: 54 | dfres = dfres[dfres["Y Label"] == y_label] 55 | # Merge two files: 56 | dfcomb = dfres.merge(dfdes, on="Nexp", how="left") 57 | return dfcomb 58 | 59 | 60 | def create_testdata(outpath, fname_out, Nexp): 61 | """ 62 | # Script for creating random set of results for testing 63 | 64 | INPUT 65 | outpath: output directory 66 | fname_out: filename in format '*.xlsx' 67 | Nexp: NUmber of experiments 68 | """ 69 | os.makedirs(outpath, exist_ok=True) 70 | PID = np.arange(1, 11) 71 | 
Yexp = np.random.rand(Nexp, len(PID)).flatten() 72 | Ytruth = np.random.rand(Nexp, len(PID)).flatten() 73 | Ylabel = np.ones(len(Yexp)) 74 | aNexp = np.zeros_like(Yexp) 75 | aPID = np.zeros_like(Yexp) 76 | for i in range(Nexp): 77 | aNexp[10 * i : 10 * i + 10] = i + 1 78 | aPID[10 * i : 10 * i + 10] = PID 79 | array = np.vstack( 80 | ( 81 | aNexp, 82 | aPID, 83 | Ylabel, 84 | Yexp, 85 | Ytruth, 86 | Ytruth * np.nan, 87 | Ytruth * np.nan, 88 | Ytruth * np.nan, 89 | ) 90 | ) 91 | header = [ 92 | "Nexp", 93 | "PID", 94 | "Y Label", 95 | "Y Exp", 96 | "Y Truth", 97 | "Std Y Exp", 98 | "Std Y Truth", 99 | "Weight PID", 100 | ] 101 | df = pd.DataFrame(array.T, columns=header) 102 | df.to_excel(os.path.join(outpath,fname_out), index=False) 103 | 104 | 105 | def weighted_avg_and_std(values, weights): 106 | """ 107 | Returns weighted average and standard deviation. 108 | 109 | INPUT 110 | values, weights -- arrays with the same shape. 111 | """ 112 | average = np.average(values, weights=weights) 113 | # Fast and numerically precise: 114 | variance = np.average((values - average) ** 2, weights=weights) 115 | return (average, np.sqrt(variance)) 116 | 117 | 118 | def calc_expresults_stats(ylabels, dfdes, dfres, outpath): 119 | """ 120 | Computation of statistical evaluation of experimetal results for each predicted y: 121 | 1) Parameter importance, which is defined by maximum y range over parameter levels (y in averaged for each level) 122 | Results are visualized in bar plot and saved as csv, including, min, max, std devioation across all levels 123 | 2) Computes RMSE and saves results as csv 124 | 3) Computes list of top experiments and their parameters 125 | 4) Computes average and variance of best parameters weighted with RMSE; saved to csv file 126 | 127 | INPUT 128 | ylabels: label ID for each target variable. 
129 | dfdes: experiment design dataframe (inlcudes one column Nexp and the other columns the factor names) 130 | dfres: experiment result dataframe (Ids in Nexp column must match design array) 131 | outpath: path for output files 132 | """ 133 | npar = len(list(dfdes)) - 1 134 | nexp = len(dfdes) 135 | params = list(dfdes)[1:] 136 | for ylabel in ylabels: 137 | dfdes_y = dfdes.copy() 138 | # Initialise array for factor results 139 | ymin_par = np.full(npar, np.nan) 140 | ymax_par = np.zeros(npar) * np.nan 141 | ystd_par = np.zeros(npar) * np.nan 142 | ymean_par = np.zeros(npar) * np.nan 143 | # Select Y data and to dfdes overall stats in dfdes 144 | ydf = dfres[dfres["Y Label"] == ylabel].copy() 145 | ymean = ydf.fillna(0).groupby("Nexp")["Y Exp"].mean() 146 | ystd = ydf.fillna(0).groupby("Nexp")["Y Exp"].std() 147 | ytruemean = ydf.fillna(0).groupby("Nexp")["Y Truth"].mean() 148 | ytruestd = ydf.fillna(0).groupby("Nexp")["Y Truth"].std() 149 | assert len(ymean) == dfdes_y.shape[0] 150 | dfdes_y["Y Exp Mean"] = ymean.values 151 | dfdes_y["Y Exp Std"] = ystd.values 152 | dfdes_y["Y Truth Mean"] = ytruemean.values 153 | dfdes_y["Y Truth Std"] = ytruestd.values 154 | # Loop over parameter to caluclate factor range, min, max, mean and stddev: 155 | for i, param in enumerate(params): 156 | levels = dfdes[param].unique() 157 | ylevel = [ 158 | np.nanmean(dfdes_y.loc[dfdes[param] == level, "Y Exp Mean"]) 159 | for level in levels 160 | ] 161 | ylevelstd = [ 162 | np.nanmean(dfdes_y.loc[dfdes[param] == level, "Y Exp Std"]) 163 | for level in levels 164 | ] 165 | ymin_par[i] = np.nanmin(ylevel) 166 | ymax_par[i] = np.nanmax(ylevel) 167 | ystd_par[i] = np.nanstd(ylevel) 168 | ymean_par[i] = np.nanmean(ylevel) 169 | ypos = np.arange(npar) 170 | width = ymax_par - ymin_par 171 | sort = np.argsort(width) 172 | # Plot factor importance as barplot 173 | plt.ioff() 174 | plt.figure(figsize=(8, 5)) 175 | plt.barh( 176 | ypos, 177 | width=width[sort], 178 | left=ymin_par[sort], 179 
| tick_label=np.asarray(params)[sort], 180 | color="red", 181 | ) 182 | plt.title("Range " + str(ylabel)) 183 | plt.tight_layout() 184 | plt.savefig(os.path.join(outpath, "Ybarplot_" + str(ylabel) + ".png"), dpi=300) 185 | plt.close() 186 | # Save factor importance to csv: 187 | res = np.vstack((width, ymin_par, ymax_par, ymean_par, ystd_par)) 188 | dfrange = pd.DataFrame( 189 | res.T, columns=["Yrange", "Ymin", "Ymax", "Ymean", "Ystd"], index=params 190 | ) 191 | dfrange.to_csv( 192 | os.path.join(outpath, "Experiment_" + str(ylabel) + "_Factorimportance.csv") 193 | ) 194 | 195 | # Calculate RMSE and best parameter space: 196 | if ydf["Y Truth"].notnull().any(): 197 | rmse = np.zeros(nexp) 198 | ytrue = np.zeros(nexp) 199 | for i in range(nexp): 200 | resid = ( 201 | ydf.loc[ydf["Nexp"] == i + 1, "Y Exp"] 202 | - ydf.loc[ydf["Nexp"] == i + 1, "Y Truth"] 203 | ) 204 | rmse[i] = np.sqrt(np.nanmean(resid ** 2)) 205 | dfdes_y["RMSE"] = rmse 206 | # Save overall results to csv with sorted RMSE 207 | dfdes_y.to_csv(os.path.join(outpath, "Experiment_" + str(ylabel) + "_RMSE.csv")) 208 | 209 | # Calculate best parameters (for only nueric parameters) 210 | if nexp >= 20: 211 | nsel = 10 212 | elif (nexp >= 10) & (nexp < 20): 213 | nsel = 5 214 | else: 215 | nsel = 3 216 | dfsort = dfdes_y.sort_values(["RMSE"], ascending=True) 217 | print( 218 | "Top " 219 | + str(nsel) 220 | + " experiments with best RMSE for " 221 | + str(ylabel) 222 | + " :" 223 | ) 224 | print(dfsort.head(nsel)) 225 | dfsort.iloc[0:nsel].to_csv( 226 | os.path.join(outpath, 227 | "Experiment_" 228 | + str(ylabel) 229 | + "_RMSE_Top" 230 | + str(nsel) 231 | + "_sorted.csv") 232 | ) 233 | """ takingh out best parameter weighting since averaging might be misleading 234 | # best parameter space is based on weighted RMSE of top results 235 | # Note that these are average parameter estimaets and are not considering multi-modal distributions 236 | # For multi-modal see list of top resulst 237 | # Select only 
numeric parameters 238 | params_num = dfsort[params]._get_numeric_data().columns 239 | param_wmean = np.zeros(len(params_num)) # weigthed mean 240 | param_wstd = np.zeros(len(params_num)) # weighted std 241 | dfsel = dfsort.iloc[0:nsel] 242 | for i, param in enumerate(params_num): 243 | param_wmean[i], param_wstd[i] = weighted_avg_and_std( 244 | dfsel[param].values, 1 / (dfsel["RMSE"].values ** 2) 245 | ) 246 | params_stats = np.vstack((param_wmean, param_wstd)) 247 | # Save to csv 248 | dfparam_avg = pd.DataFrame( 249 | params_stats.T, 250 | index=params_num, 251 | columns=["Weighted Average", "Weigted Stddev"], 252 | ) 253 | dfparam_avg.to_csv( 254 | cfg.outpath + "Experiment_" + str(ylabel) + "_Best-Parameter-Avg.csv" 255 | ) 256 | # plot dataframe table 257 | plot_table(dfparam_avg, cfg.outpath, "BestFactor_Avg_" + str(ylabel) + ".png") 258 | """ 259 | """ 260 | dfparam_avg.plot(kind="bar", y="Weighted Average", yerr="Weigted Stddev") 261 | plt.tight_layout() 262 | """ 263 | 264 | 265 | 266 | #clean up x and y axis plots if there are too many decimal points or scientific notation. 
def nicexAxis(ax):
    """Clean up numeric x-axis tick labels on *ax*.

    Labels in scientific notation (containing 'e') are reformatted to 3
    significant digits; plain numeric labels are rounded to 3 decimals.
    Labels that cannot be parsed as floats (e.g. empty strings on a figure
    that has not been drawn yet) are kept unchanged instead of raising.
    Returns the modified axis.
    """
    new_labels = []
    for tick in ax.get_xticklabels():
        text = tick.get_text()
        try:
            value = float(text)
        except ValueError:
            # Non-numeric (or not-yet-rendered) label: leave as-is.
            new_labels.append(text)
            continue
        if 'e' in text:
            new_labels.append('{:.3g}'.format(value))
        else:
            new_labels.append(str(round(value, 3)))
    ax.set_xticklabels(new_labels)
    return ax


def niceyAxis(ax):
    """Clean up numeric y-axis tick labels on *ax* (see nicexAxis)."""
    new_labels = []
    for tick in ax.get_yticklabels():
        text = tick.get_text()
        try:
            value = float(text)
        except ValueError:
            new_labels.append(text)
            continue
        if 'e' in text:
            new_labels.append('{:.3g}'.format(value))
        else:
            new_labels.append(str(round(value, 3)))
    ax.set_yticklabels(new_labels)
    return ax


# Make 3d correlation plot with heatmap
# (gridded heatmaps support categorical factors as well as continuous ones)
def plot_3dmap(df, params, target_name, fname_out):
    """
    Plots the mean of *target_name* as a function of each pairwise
    combination of factors, as a corner plot of gridded heatmaps.

    INPUT
    df: pandas dataframe holding the factor columns and the target column
    params: list of factor names
    target_name: column to aggregate and colour by (e.g. 'Y Exp Mean', 'RMSE')
    fname_out: output path + filename for image

    OUTPUT
    Cornerplot image of pairwise heatmaps saved to fname_out
    """
    nfac = len(params)
    # First pass: find global colour limits across all pairwise pivot
    # tables so every heatmap shares one colour scale.
    # Sentinel initialisation: each cell is a nan-mean of the target, so it
    # can only lie within the raw data range -- start the min-tracker at the
    # overall max and the max-tracker at the overall min.
    ymin0 = df[target_name].max()
    ymax0 = df[target_name].min()
    for i in range(nfac - 1):
        for j in range(i + 1, nfac):
            table = pd.pivot_table(
                df,
                values=target_name,
                index=[params[j]],
                columns=[params[i]],
                aggfunc=np.nanmean,
            )
            if np.min(table.min()) < ymin0:
                ymin0 = np.min(table.min())
            if np.max(table.max()) > ymax0:
                ymax0 = np.max(table.max())

    # Second pass: draw the lower-triangle corner plot.
    plt.ioff()  # automatic disables display of figures
    fig, axs = plt.subplots(nfac - 1, nfac - 1, figsize=(nfac * 2, nfac * 2))
    for i in range(nfac - 1):
        for j in range(i + 1, nfac):
            table = pd.pivot_table(
                df,
                values=target_name,
                index=[params[j]],
                columns=[params[i]],
                aggfunc=np.nanmean,
            )
            g = sns.heatmap(
                table,
                cmap="Spectral",
                annot=False,
                ax=axs[j - 1, i],
                vmin=ymin0,
                vmax=ymax0,
                square=True,
                cbar=False,
            )

            # NOTE(review): hard-coded factor names from one particular
            # project setup; tick-label cleanup only runs for these.
            # Consider promoting this list to a parameter.
            if params[j] in ('N', 'Erodibility', 'MNrat'):
                axs[j - 1, i] = niceyAxis(axs[j - 1, i])
            if params[i] in ('N', 'Erodibility', 'MNrat'):
                axs[j - 1, i] = nicexAxis(axs[j - 1, i])

            # Keep axis labels/ticks only on the outer column and bottom row.
            if i > 0:
                g.set_ylabel("")
                g.set(yticklabels=[])
            if j < nfac - 1:
                g.set_xlabel("")
                g.set(xticklabels=[])
    # Blank out the unused upper-triangle axes with an all-NaN heatmap:
    for i in range(1, nfac - 1):
        for j in range(1, i + 1):
            g = sns.heatmap(
                table * np.nan,
                cmap="Spectral",
                annot=False,
                ax=axs[j - 1, i],
                vmin=ymin0,
                vmax=ymax0,
                square=True,
                cbar=False,
            )
            g.set_ylabel("")
            g.set(yticklabels=[])
            g.set_xlabel("")
            g.set(xticklabels=[])

    # Draw the shared colorbar on the (otherwise unused) top-right axis.
    g = sns.heatmap(
        table * np.nan,
        cmap="Spectral",
        annot=False,
        ax=axs[0, 1],
        vmin=ymin0,
        vmax=ymax0,
        square=True,
        cbar=True,
    )
    g.set_xlabel("")
    g.set_ylabel("")
    g.set(yticklabels=[])
    g.set(xticklabels=[])
    fig.suptitle("Pair-Variate Plot for " + str(target_name) + " Function")
    plt.savefig(fname_out, dpi=300, bbox_inches="tight")
    # Close the figure: main() calls this twice per Y label, and unclosed
    # figures would otherwise accumulate until matplotlib warns/leaks.
    plt.close(fig)

## This doesn't appear to get used anywhere from the main doeval function, can call this individually?
379 | def plot_3dmap_rmse(df, params, fname_out): 380 | """ 381 | Plots RMSE value as function of two differnt X variates for each pairwise combination of factors 382 | The plot is using a gridded heatmap which enablesto visualise also categorical factors 383 | and not just numerical data 384 | 385 | INPUT 386 | df: pandas dataframe 387 | params: list of factor names 388 | target_name: 389 | dfname_out: output path + filename for image 390 | 391 | OUTPUT 392 | Cornerplot of Y-PairwiseCorrelation Images 393 | """ 394 | print('Plotting RMSE as function of pairwise covariates ...') 395 | nfac = len(params) 396 | # Check first for max and min value 397 | ymin0 = df["RMSE"].max() 398 | ymax0 = df["RMSE"].min() 399 | for i in range(nfac - 1): 400 | for j in range(i + 1, nfac): 401 | table = pd.pivot_table( 402 | df, 403 | values="RMSE", 404 | index=[params[j]], 405 | columns=[params[i]], 406 | aggfunc=np.nanmean, 407 | ) 408 | if np.min(table.min()) < ymin0: 409 | ymin0 = np.min(table.min()) 410 | if np.max(table.max()) > ymax0: 411 | ymax0 = np.max(table.max()) 412 | # Make corner plot 413 | # sns.set_style("whitegrid") 414 | plt.ioff() # automatic disables display of figures 415 | # fig, axs = plt.subplots(nfac-1, nfac-1, sharex=True, sharey=True, figsize=(nfac*2, nfac*2)) 416 | fig, axs = plt.subplots(nfac - 1, nfac - 1, figsize=(nfac * 2, nfac * 2)) 417 | for i in range(nfac - 1): 418 | for j in range(i + 1, nfac): 419 | table = pd.pivot_table( 420 | df, 421 | values="RMSE", 422 | index=[params[j]], 423 | columns=[params[i]], 424 | aggfunc=np.nanmean, 425 | ) 426 | g = sns.heatmap( 427 | table, 428 | cmap="viridis", 429 | annot=False, 430 | ax=axs[j - 1, i], 431 | vmin=ymin0, 432 | vmax=ymax0, 433 | square=True, 434 | cbar=False, 435 | ) 436 | if i > 0: 437 | g.set_ylabel("") 438 | g.set(yticklabels=[]) 439 | if j < nfac - 1: 440 | g.set_xlabel("") 441 | g.set(xticklabels=[]) 442 | # remove remaining plots: 443 | for i in range(1, nfac - 1): 444 | for j in range(1, i 
+ 1): 445 | g = sns.heatmap( 446 | table * np.nan, 447 | cmap="viridis", 448 | annot=False, 449 | ax=axs[j - 1, i], 450 | vmin=ymin0, 451 | vmax=ymax0, 452 | square=True, 453 | cbar=False, 454 | ) 455 | g.set_ylabel("") 456 | g.set(yticklabels=[]) 457 | g.set_xlabel("") 458 | g.set(xticklabels=[]) 459 | # Make colorbar 460 | g = sns.heatmap( 461 | table * np.nan, 462 | cmap="viridis", 463 | annot=False, 464 | ax=axs[0, 1], 465 | vmin=ymin0, 466 | vmax=ymax0, 467 | square=True, 468 | cbar=True, 469 | ) 470 | g.set_xlabel("") 471 | g.set_ylabel("") 472 | g.set(yticklabels=[]) 473 | g.set(xticklabels=[]) 474 | fig.suptitle("Pair-Variate Plot for RMSE Function") 475 | plt.savefig(fname_out, dpi=300) 476 | 477 | 478 | def plot_regression(df, params, target_name, fname_out): 479 | """ 480 | Creates Correlation plot with Y or RMSE for each numeric Variate 481 | Note that only numeric data is selected for this plot 482 | 483 | INPUT 484 | df: dataframe 485 | params: list of factor names 486 | target_name: 'Y Exp Mean' or 'RMSE' 487 | 488 | OUTPUT 489 | Image with Correlations 490 | """ 491 | # Select numeric variates: 492 | columns = df[params]._get_numeric_data().columns 493 | nfac = len(columns) 494 | nax1 = int(np.sqrt(nfac)) 495 | nax2 = int(np.ceil(nfac / int(np.sqrt(nfac)))) 496 | # fig, axs = plt.subplots(nax1, nax2, figsize=(nax1 * 3, nax2 * 3)) 497 | plt.ioff() # automatic disables display of figures 498 | fig = plt.figure(figsize=(nax1 * 5, nax2 * 4)) 499 | for i in range(nfac): 500 | r = df[columns[i]].corr(df[target_name]) 501 | plt.subplot(nax2, nax1, i + 1) 502 | # sns.lmplot( x = columns[0], y = 'Y Exp Mean', data = df) 503 | ax = sns.regplot(x=columns[i], y=target_name, data=df) 504 | ax.annotate("r = {:.3f}".format(r), xy=(0.1, 0.9), xycoords=ax.transAxes) 505 | plt.savefig(fname_out, dpi=300) 506 | 507 | def plot_factordis(df, params, target_name, fname_out): 508 | """ 509 | Creates distribution plot of Y or RMSE for each numeric Variate 510 | Note that 
only numeric data is selected for this plot 511 | 512 | INPUT 513 | df: dataframe 514 | params: list of factor names 515 | target_name: 'Y Exp Mean' or 'RMSE' 516 | 517 | OUTPUT 518 | Image with Correlations 519 | """ 520 | # Select numeric variates: 521 | columns = df[params]._get_numeric_data().columns 522 | nfac = len(columns) 523 | nax1 = int(np.sqrt(nfac)) 524 | nax2 = int(np.ceil(nfac / int(np.sqrt(nfac)))) 525 | # fig, axs = plt.subplots(nax1, nax2, figsize=(nax1 * 3, nax2 * 3)) 526 | plt.ioff() # automatic disables display of figures 527 | fig = plt.figure(figsize=(nax1 * 5, nax2 * 4)) 528 | for i in range(nfac): 529 | plt.subplot(nax2, nax1, i + 1) 530 | ax = sns.violinplot(y=df[target_name], x=df[columns[i]]) 531 | plt.savefig(fname_out, dpi=300) 532 | 533 | def plot_table(df_table, outpath, fname_out): 534 | """ 535 | Plot Dataframe as formatted table 536 | 537 | INPUT 538 | df_table: dataframe 539 | outpath: output path 540 | fname_out: image output filename 541 | """ 542 | # Format table 543 | 544 | #df_table.to_string(float_format=lambda x: '%.3f' % x) 545 | plt.ioff() 546 | plt.figure(linewidth=2, 547 | tight_layout={'pad':40}, 548 | figsize=(7,4) 549 | ) 550 | # Set colors for row and column headers 551 | rcolors = plt.cm.BuPu(np.full(len(df_table), 0.15)) 552 | ccolors = plt.cm.BuPu(np.full(len(list(df_table)), 0.15)) 553 | 554 | # Hide axes 555 | ax = plt.gca() 556 | ax.get_xaxis().set_visible(False) 557 | ax.get_yaxis().set_visible(False) 558 | table = pd.plotting.table(ax, df_table, loc = 'center', 559 | rowLoc='left', 560 | colLoc = 'center', 561 | rowColours=rcolors, 562 | colColours=ccolors) 563 | table.scale(0.6, 1.3) 564 | table.set_fontsize(7) 565 | plt.box(on=None) 566 | # plt.tight_layout(pad = 40) 567 | plt.draw() 568 | plt.savefig(os.path.join(outpath, fname_out), dpi=300) 569 | plt.close() 570 | 571 | 572 | def main(inpath, fname_results, fname_design, outpath = None): 573 | 574 | if outpath is None: 575 | outpath = inpath = 
Path(inpath) 576 | else: 577 | outpath = Path(outpath) 578 | os.makedirs(outpath, exist_ok = True) 579 | # 1) Read in experiment result data 580 | if fname_results.endswith('.xlsx'): 581 | dfres = pd.read_excel(os.path.join(inpath, fname_results)) 582 | elif fname_results.endswith('.csv'): 583 | dfres = pd.read_csv(os.path.join(inpath, fname_results)) 584 | # ['Nexp' 'PID', 'Y Label', 'Y Exp', 'Y Truth', 'Std Y Exp', 'Std Y Truth', 'Weight PID'] 585 | # 2) Read in experiment design setup table with parameter specifications 586 | dfdes = pd.read_csv(os.path.join(inpath, fname_design)) 587 | # dfdes = pd.read_csv('designs_Danial/' + 'designtable_Nrun36.csv' ) 588 | 589 | # Filter out design parameters that are constant 590 | dfdes = dfdes[dfdes.columns[dfdes.nunique() > 1]].copy() 591 | 592 | # List of different predictable Y properties: 593 | try: 594 | ylabels = dfres["Y Label"].unique() 595 | except: 596 | print("No column with name 'Y Label' found in results file. Default results name 'Y1' added.") 597 | dfres["Y Label"] = 'Y1' 598 | ylabels = dfres["Y Label"].unique() 599 | params = list(dfdes)[1:] 600 | npar = len(params) 601 | nexp = dfdes.shape[0] 602 | 603 | # Calculating main stats (RMSE, parameter importance, best parameters) 604 | calc_expresults_stats(ylabels, dfdes, dfres, outpath) 605 | 606 | # Visualise correlation results for each Y predictable 607 | for ylabel in ylabels: 608 | print("Plotting correlation plots for Ylabel:" + str(ylabel) + " ...") 609 | dfname = os.path.join(outpath, "Experiment_" + str(ylabel) + "_RMSE.csv") 610 | df = pd.read_csv(dfname) 611 | # Plot Pairwise X correlation for Y: 612 | fname_out1 = (os.path.join( 613 | outpath, "Y-pairwise-correlation_" + str(ylabel) + ".png") 614 | ) 615 | plot_3dmap(df, params, "Y Exp Mean", fname_out1) 616 | # Plot Pairwise X correlation for RMSE 617 | fname_out2 = (os.path.join( 618 | outpath, "RMSE-pairwise-correlation_" + str(ylabel) + ".png") 619 | ) 620 | plot_3dmap(df, params, "RMSE", 
fname_out2) 621 | # Plot Main factor correlation plot with Y: 622 | fname_out3 = os.path.join(outpath, "Expresult_correlation_X-Y_" + str(ylabel) + ".png") 623 | 624 | plot_regression(df, params, 'Y Exp Mean', fname_out3) 625 | fname_out4 = os.path.join(outpath, "Expresult_distribution_X-RMSE_" + str(ylabel) + ".png") 626 | 627 | plot_factordis(df, params, 'RMSE', fname_out4) 628 | 629 | print("FINISHED") 630 | 631 | 632 | def main_cli(): 633 | ap = argparse.ArgumentParser() 634 | ap.add_argument('settings_path', nargs='?', default='settings_expresults.yaml') 635 | args = ap.parse_args() 636 | print(f"using settings in: {args.settings_path!r}") 637 | with open(args.settings_path) as f: 638 | cfg = yaml.safe_load(f) 639 | main(**cfg) 640 | 641 | 642 | if __name__ == "__main__": 643 | #from doegen import configloader_results as cfg 644 | main_cli() -------------------------------------------------------------------------------- /doegen/init_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run this initialisation of .yaml and .xlsx files after installation of doegen. 3 | 4 | Creates config yaml files and excel templates if not already existing in current working directory. 5 | """ 6 | import os 7 | import sys 8 | import shutil 9 | import doegen 10 | 11 | #Current working directory 12 | cwd = os.getcwd() 13 | 14 | #Directory where package doegen is installed: 15 | pckd = doegen.__path__[0] 16 | 17 | # Create settings_design.yaml 18 | if os.path.isfile('settings_design.yaml'): 19 | print('File settings_design.yaml already exists.') 20 | else: 21 | source = os.path.join(pckd,'settings_design.yaml') 22 | target = os.path.join(cwd,'settings_design.yaml') 23 | try: 24 | shutil.copy(source, target) 25 | print('Please edit settings_design.yaml') 26 | except IOError as e: 27 | print("Unable to copy file. 
%s" % e) 28 | except: 29 | print("Unexpected error:", sys.exc_info()) 30 | 31 | # Create Experiment_setup.xlsx 32 | if os.path.isfile('Experiment_setup.xlsx'): 33 | print('File Experiment_setup.xlsx already exist.') 34 | else: 35 | source = os.path.join(pckd,'Experiment_setup.xlsx') 36 | target = os.path.join(cwd,'Experiment_setup.xlsx') 37 | try: 38 | shutil.copy(source, target) 39 | print('Please add your experiment settings in Experiment_setup.xlsx') 40 | except IOError as e: 41 | print("Unable to copy file. %s" % e) 42 | except: 43 | print("Unexpected error:", sys.exc_info()) 44 | 45 | 46 | # Create settings_expresults.yaml 47 | if os.path.isfile('settings_expresults.yaml'): 48 | print('File settings_expresults.yaml already exists.') 49 | else: 50 | source = os.path.join(pckd,'settings_expresults.yaml') 51 | target = os.path.join(cwd,'settings_expresults.yaml') 52 | try: 53 | shutil.copy(source, target) 54 | print('Please edit settings_expresults.yaml after running the experiment.') 55 | except IOError as e: 56 | print("Unable to copy file. %s" % e) 57 | except: 58 | print("Unexpected error:", sys.exc_info()) 59 | 60 | # Create Experiment_results.xlsx 61 | if os.path.isfile('Experiment_results.xlsx'): 62 | print('File Experiment_results.xlsx already exist.') 63 | else: 64 | source = os.path.join(pckd,'Experiment_results.xlsx') 65 | target = os.path.join(cwd,'Experiment_results.xlsx') 66 | try: 67 | shutil.copy(source, target) 68 | print('Please add your experiment results in Experiment_results.xlsx after running the experiment.') 69 | except IOError as e: 70 | print("Unable to copy file. %s" % e) 71 | except: 72 | print("Unexpected error:", sys.exc_info()) -------------------------------------------------------------------------------- /doegen/init_tests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Run this initialisation of .yaml and .xlsx files after installation of doegen. 
 3 | 4 | Creates config yaml files and excel templates if not already existing in current working directory. 5 | """ 6 | import os 7 | import sys 8 | import shutil 9 | import doegen 10 | 11 | #Current working directory 12 | cwd = os.getcwd() 13 | 14 | #Directory where package doegen is installed: 15 | pckd = doegen.__path__[0] 16 | 17 | # Create settings_design.yaml 18 | if os.path.exists('test'): 19 | print("directory already exist: test") 20 | else: 21 | source = os.path.join(pckd,'test') 22 | target = os.path.join(cwd,'test') 23 | try: 24 | shutil.copytree(source, target) 25 | print("Generated directory: test") 26 | except IOError as e: 27 | print("Unable to copy directory. %s" % e) 28 | except: 29 | print("Unexpected error:", sys.exc_info()) -------------------------------------------------------------------------------- /doegen/settings_design.yaml: -------------------------------------------------------------------------------- 1 | # Settings for Experiment Design Generation 2 | 3 | # Path to exp design setup file 4 | path: 'test/' 5 | # Set path for output files. If empty (''), output folder will be same as above for setup file 6 | outpath: 'test/design_results/' 7 | # Filename for exp setup file 8 | fname_setup: 'Experiment_setup.xlsx' 9 | # Maximum number of experiment runs: 10 | nrun_max: 150 11 | 12 | # Set maximum time for optimization per run (in seconds; recommended to set to at least ~100s) 13 | maxtime_per_run: 100 14 | 15 | # Set maximal stepsize of run size interval, so that not every run size has to be optimized 16 | # The larger the interval, the faster the total computation 17 | # by default (select delta_nrun = None) it will select automatically the interval step with the lowest common multiple of the levels 18 | # (e.g. 
if mix between level 2 and 3 thus will results in delta_nrun = 6) 19 | delta_nrun: None -------------------------------------------------------------------------------- /doegen/settings_expresults.yaml: -------------------------------------------------------------------------------- 1 | # Settings for Experiment Design Generation 2 | 3 | # Path to exp design setup file 4 | inpath: 'test/design_results/' 5 | # Set path for output files. Ff '', output folder will be same as above for setup file 6 | outpath: 'test/evaluation_results/' 7 | # Filename for exp design table in inpath 8 | fname_design: 'Designtable_optimal_Nrun72.csv' 9 | # Filename for exp results in inpath: 10 | fname_results: 'experiment_results_Nrun72.xlsx' -------------------------------------------------------------------------------- /doegen/test/Experiment_setup_test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/test/Experiment_setup_test.xlsx -------------------------------------------------------------------------------- /doegen/test/results/Designtable_best_Nrun18.csv: -------------------------------------------------------------------------------- 1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8 2 | 1,0.0,-10.0,1,1.0,L1,0,0,L1 3 | 2,3.0,-3.0,3,1.0,L2,0,0,L1 4 | 3,3.0,4.0,1,3.0,L2,1,1,L1 5 | 4,3.0,-10.0,3,5.0,L2,2,0,L1 6 | 5,6.0,4.0,3,3.0,L3,0,1,L1 7 | 6,6.0,-3.0,3,5.0,L1,0,0,L2 8 | 7,3.0,-3.0,1,3.0,L1,0,0,L2 9 | 8,3.0,-3.0,5,5.0,L1,1,1,L1 10 | 9,6.0,-10.0,1,5.0,L3,1,1,L1 11 | 10,0.0,4.0,5,5.0,L2,0,1,L2 12 | 11,6.0,-3.0,1,1.0,L2,2,1,L2 13 | 12,3.0,4.0,1,5.0,L3,2,0,L2 14 | 13,6.0,-10.0,5,3.0,L2,1,0,L2 15 | 14,0.0,-3.0,5,3.0,L3,2,0,L1 16 | 15,0.0,-10.0,3,3.0,L1,2,1,L2 17 | 16,6.0,4.0,5,1.0,L1,2,0,L1 18 | 17,0.0,4.0,3,1.0,L3,1,0,L2 19 | 18,3.0,-10.0,5,1.0,L3,0,1,L2 20 | 
-------------------------------------------------------------------------------- /doegen/test/results/Designtable_minimum_Nrun30.csv: -------------------------------------------------------------------------------- 1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8 2 | 1,0.0,-10.0,1,1.0,L1,0,0,L1 3 | 2,0.0,-3.0,3,1.0,L2,1,0,L1 4 | 3,3.0,-10.0,1,3.0,L2,0,0,L2 5 | 4,6.0,4.0,5,5.0,L2,2,0,L2 6 | 5,3.0,-10.0,5,5.0,L3,1,0,L1 7 | 6,0.0,-10.0,5,3.0,L1,2,0,L2 8 | 7,3.0,4.0,1,3.0,L1,1,1,L1 9 | 8,3.0,-10.0,1,3.0,L2,2,1,L1 10 | 9,3.0,-3.0,5,1.0,L1,0,0,L2 11 | 10,6.0,-10.0,1,1.0,L2,1,1,L2 12 | 11,3.0,-10.0,3,5.0,L3,1,0,L2 13 | 12,6.0,4.0,1,5.0,L3,0,0,L1 14 | 13,0.0,-3.0,3,5.0,L2,0,1,L1 15 | 14,6.0,-10.0,3,1.0,L3,0,1,L2 16 | 15,3.0,4.0,3,1.0,L1,2,0,L2 17 | 16,6.0,4.0,3,3.0,L1,1,1,L1 18 | 17,3.0,4.0,5,1.0,L2,0,1,L1 19 | 18,6.0,-10.0,5,1.0,L3,2,1,L1 20 | 19,0.0,-10.0,5,5.0,L1,1,1,L1 21 | 20,3.0,-3.0,3,3.0,L3,1,1,L2 22 | 21,3.0,-3.0,5,1.0,L3,2,1,L1 23 | 22,0.0,4.0,5,3.0,L3,0,1,L2 24 | 23,3.0,-3.0,1,5.0,L1,0,1,L2 25 | 24,6.0,-3.0,1,5.0,L1,2,1,L2 26 | 25,0.0,-10.0,3,5.0,L2,2,1,L2 27 | 26,0.0,4.0,1,1.0,L3,1,0,L2 28 | 27,6.0,-3.0,5,3.0,L2,1,0,L2 29 | 28,3.0,4.0,3,5.0,L2,2,0,L1 30 | 29,0.0,-3.0,1,3.0,L3,2,0,L1 31 | 30,6.0,-10.0,3,3.0,L1,0,0,L1 32 | -------------------------------------------------------------------------------- /doegen/test/results/Designtable_optimal_Nrun72.csv: -------------------------------------------------------------------------------- 1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8 2 | 1,0.0,-10.0,1,1.0,L1,0,0,L1 3 | 2,3.0,-3.0,3,3.0,L1,1,1,L2 4 | 3,0.0,4.0,5,5.0,L1,2,0,L1 5 | 4,3.0,-3.0,3,3.0,L2,2,1,L1 6 | 5,6.0,-3.0,5,1.0,L3,0,1,L2 7 | 6,0.0,4.0,3,5.0,L3,1,1,L1 8 | 7,6.0,-10.0,3,3.0,L3,1,1,L1 9 | 8,0.0,-3.0,1,1.0,L1,2,1,L2 10 | 9,3.0,4.0,5,5.0,L2,1,1,L2 11 | 10,6.0,-3.0,1,3.0,L2,1,0,L1 12 | 11,3.0,-10.0,3,5.0,L1,1,0,L2 13 | 12,0.0,-10.0,3,1.0,L2,2,1,L2 14 | 13,0.0,4.0,3,5.0,L2,1,0,L1 15 | 
14,3.0,-10.0,1,5.0,L2,2,1,L1 16 | 15,0.0,-10.0,3,3.0,L1,1,0,L2 17 | 16,6.0,-10.0,5,5.0,L2,0,0,L2 18 | 17,6.0,-10.0,3,5.0,L3,0,1,L2 19 | 18,6.0,4.0,1,3.0,L3,2,1,L2 20 | 19,0.0,-3.0,3,3.0,L3,2,1,L1 21 | 20,0.0,-3.0,3,5.0,L1,0,0,L1 22 | 21,3.0,-10.0,5,1.0,L3,2,0,L1 23 | 22,3.0,-10.0,5,3.0,L2,1,1,L1 24 | 23,6.0,-3.0,3,5.0,L2,0,0,L1 25 | 24,3.0,4.0,3,1.0,L3,2,0,L2 26 | 25,0.0,-10.0,1,3.0,L3,2,0,L2 27 | 26,0.0,4.0,3,5.0,L3,2,0,L2 28 | 27,6.0,4.0,1,1.0,L2,1,0,L2 29 | 28,3.0,-3.0,5,1.0,L3,2,0,L1 30 | 29,3.0,4.0,3,3.0,L1,2,0,L2 31 | 30,0.0,-3.0,5,5.0,L3,0,1,L2 32 | 31,6.0,4.0,3,5.0,L1,2,0,L1 33 | 32,6.0,-10.0,5,3.0,L1,2,1,L1 34 | 33,3.0,-3.0,3,3.0,L2,0,0,L2 35 | 34,3.0,-3.0,1,5.0,L3,0,1,L2 36 | 35,0.0,-10.0,1,5.0,L2,1,0,L2 37 | 36,0.0,-3.0,5,1.0,L2,1,0,L1 38 | 37,0.0,4.0,1,1.0,L2,2,1,L1 39 | 38,6.0,-3.0,5,1.0,L1,1,0,L2 40 | 39,0.0,-10.0,5,3.0,L3,0,1,L1 41 | 40,0.0,4.0,5,3.0,L2,0,0,L1 42 | 41,3.0,4.0,3,1.0,L3,1,1,L1 43 | 42,0.0,-10.0,1,1.0,L3,0,0,L1 44 | 43,0.0,4.0,5,3.0,L2,0,1,L1 45 | 44,3.0,-3.0,5,5.0,L3,1,0,L1 46 | 45,6.0,4.0,5,1.0,L1,1,1,L1 47 | 46,6.0,-10.0,1,3.0,L3,1,0,L1 48 | 47,6.0,4.0,1,3.0,L3,2,0,L1 49 | 48,0.0,-10.0,3,1.0,L1,1,1,L2 50 | 49,6.0,-3.0,3,3.0,L2,2,1,L2 51 | 50,6.0,-3.0,5,5.0,L2,2,1,L1 52 | 51,0.0,-3.0,1,5.0,L3,1,1,L2 53 | 52,6.0,4.0,3,1.0,L2,0,1,L2 54 | 53,6.0,-10.0,3,1.0,L1,0,1,L2 55 | 54,6.0,-10.0,1,5.0,L1,1,1,L1 56 | 55,3.0,-10.0,5,5.0,L3,0,0,L2 57 | 56,0.0,-3.0,5,3.0,L1,2,0,L2 58 | 57,3.0,-3.0,1,3.0,L1,0,1,L1 59 | 58,3.0,-10.0,3,1.0,L2,0,0,L1 60 | 59,6.0,4.0,5,3.0,L3,1,0,L2 61 | 60,3.0,4.0,5,1.0,L3,1,1,L2 62 | 61,3.0,-3.0,1,1.0,L1,1,0,L1 63 | 62,3.0,-10.0,5,5.0,L1,2,1,L1 64 | 63,6.0,-3.0,3,1.0,L3,0,0,L1 65 | 64,0.0,4.0,5,3.0,L1,0,1,L2 66 | 65,6.0,4.0,1,5.0,L1,0,1,L1 67 | 66,0.0,-3.0,1,1.0,L2,1,1,L2 68 | 67,3.0,4.0,1,3.0,L1,0,0,L2 69 | 68,3.0,4.0,1,3.0,L2,0,0,L2 70 | 69,6.0,-10.0,5,1.0,L2,2,0,L2 71 | 70,3.0,-10.0,1,5.0,L2,2,1,L2 72 | 71,6.0,-3.0,1,5.0,L1,2,0,L2 73 | 72,3.0,4.0,1,1.0,L1,0,1,L1 74 | 
-------------------------------------------------------------------------------- /doegen/test/results/experiment_results_Nrun72.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/test/results/experiment_results_Nrun72.xlsx -------------------------------------------------------------------------------- /doegen/test/settings_design_test.yaml: -------------------------------------------------------------------------------- 1 | # Settings for Experiment Design Generation 2 | 3 | # Path to exp design setup file 4 | path: 'test/' 5 | # Set path for output files. If empty (''), output folder will be same as above for setup file 6 | outpath: 'test/results_test/' 7 | # Filename for exp setup file 8 | fname_setup: 'Experiment_setup_test.xlsx' 9 | # Maximum number of experimental runs: 10 | nrun_max: 80 11 | 12 | # Set maximum time for optimization per run (in seconds, recommended to set to at least ~100s) 13 | maxtime_per_run: 80 14 | 15 | # Set maximal stepsize of run size interval, so that not every run size has to be optimized 16 | # The larger the interval, the faster the total computation 17 | # by default (select delta_nrun = None) it will select automatically the interval step with the lowest common multiple of the levels 18 | # (e.g. a mix of levels 2 and 3 will result in delta_nrun = 6) 19 | delta_nrun: None -------------------------------------------------------------------------------- /doegen/test/settings_expresults_test.yaml: -------------------------------------------------------------------------------- 1 | # Settings for Experiment Design Generation 2 | 3 | # Path to exp design setup file 4 | inpath: 'test/results/' 5 | # Set path for output files. 
If '', output folder will be same as above for setup file 6 | outpath: 'test/expresults2/' 7 | # Filename for exp design table in inpath 8 | fname_design: 'Designtable_optimal_Nrun72.csv' 9 | # Filename for exp results in inpath: 10 | fname_results: 'experiment_results_Nrun72.xlsx' -------------------------------------------------------------------------------- /figures/BestFactor_Avg1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/BestFactor_Avg1.png -------------------------------------------------------------------------------- /figures/Designtable_optimal_Nrun72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Designtable_optimal_Nrun72.png -------------------------------------------------------------------------------- /figures/Efficiencies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Efficiencies.png -------------------------------------------------------------------------------- /figures/Efficiencies_[3, 3, 3, 3, 3, 3, 2, 2].png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Efficiencies_[3, 3, 3, 3, 3, 3, 2, 2].png -------------------------------------------------------------------------------- /figures/Experiment_result_Nrun72_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Experiment_result_Nrun72_header.png
-------------------------------------------------------------------------------- /figures/Expresult_correlation_X_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Expresult_correlation_X_1.png -------------------------------------------------------------------------------- /figures/Expresult_pairwise-correlation_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Expresult_pairwise-correlation_1.png -------------------------------------------------------------------------------- /figures/Result_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Result_header.png -------------------------------------------------------------------------------- /figures/Results_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Results_overview.png -------------------------------------------------------------------------------- /figures/Setup_header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Setup_header.png -------------------------------------------------------------------------------- /figures/Setup_header_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Setup_header_test.png -------------------------------------------------------------------------------- 
/figures/Top10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Top10.png -------------------------------------------------------------------------------- /figures/Ybarplot_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Ybarplot_1.png -------------------------------------------------------------------------------- /figures/pairwise_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/pairwise_correlation.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | xlrd==1.2.0 2 | matplotlib==3.3.4 3 | pandas>=1.2.3 4 | XlsxWriter==1.2.8 5 | OApackage>=2.7.7 6 | numpy>=1.22.0 7 | seaborn==0.11.1 8 | PyYAML==5.4.1 9 | scikit_learn==0.24.1 10 | openpyxl==3.0.7 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup, find_packages 3 | except ImportError: 4 | from distutils.core import setup 5 | from os import path 6 | import os 7 | import subprocess 8 | import io 9 | 10 | ## in development set version 11 | PYPI_VERSION = '0.5.0' 12 | 13 | # Return the git revision as a string (from numpy) 14 | def git_version(): 15 | def _minimal_ext_cmd(cmd): 16 | # construct minimal environment 17 | env = {} 18 | for k in ['SYSTEMROOT', 'PATH']: 19 | v = os.environ.get(k) 20 | if v is not None: 21 | env[k] = v 22 | # LANGUAGE is used on win32 23 | env['LANGUAGE'] = 'C' 24 | env['LANG'] = 
'C' 25 | env['LC_ALL'] = 'C' 26 | out = subprocess.Popen(cmd, stdout = subprocess.PIPE, env=env).communicate()[0] 27 | return out 28 | 29 | try: 30 | out = _minimal_ext_cmd(['git', 'rev-parse', '--short', 'HEAD']) 31 | GIT_REVISION = out.strip().decode('ascii') 32 | except OSError: 33 | GIT_REVISION = "Unknown" 34 | 35 | return GIT_REVISION 36 | 37 | 38 | if PYPI_VERSION is None: 39 | PYPI_VERSION = git_version() 40 | 41 | 42 | this_directory = path.abspath(path.dirname(__file__)) 43 | with io.open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 44 | long_description = f.read() 45 | 46 | packages = find_packages() 47 | 48 | if __name__ == "__main__": 49 | setup(name = 'DoEgen', 50 | author = "Sebastian Haan", 51 | author_email = "sebastian.haan@sydney.edu.au", 52 | url = "https://github.com/sebhaan/DoEgen", 53 | version = PYPI_VERSION, 54 | description = "DoEgen: A Python Library for Optimised Design of Experiment Generation and Evaluation", 55 | long_description = long_description, 56 | long_description_content_type='text/markdown', 57 | install_requires = ['numpy>=1.16.3', 58 | 'xlrd==1.2.0', 59 | 'pandas>=1.0.3', 60 | 'XlsxWriter>=1.2.8', 61 | 'openpyxl>=3.0.7', 62 | 'seaborn>=0.11.1', 63 | 'OApackage>=2.7.11', 64 | 'tabulate>=0.8.9', 65 | 'matplotlib>=3.1.0', 66 | 'PyYAML>=5.3.1', 67 | 'scikit_learn>=0.22.2.post1'], 68 | python_requires = '>=3.6', 69 | setup_requires = ["pytest-runner", 'webdav'], 70 | tests_require = ["pytest", 'webdav'], 71 | packages = ['doegen'], 72 | package_data = {'doegen': ['*.yaml', 73 | '*.xlsx', 74 | 'test/Experiment_setup_test.xlsx', 75 | 'test/settings_design_test.yaml', 76 | 'test/settings_expresults_test.yaml', 77 | 'test/results/experiment_results_Nrun72.xlsx', 78 | 'test/results/Designtable_optimal_Nrun72.csv']}, 79 | include_package_data = False, 80 | classifiers = ['Programming Language :: Python :: 3', 81 | 'Programming Language :: Python :: 3.6', 82 | 'Programming Language :: Python :: 3.7' 83 | ] 84 | ) 
--------------------------------------------------------------------------------