├── .gitignore
├── LICENSE.md
├── MANUAL.md
├── README.md
├── docker
├── Dockerfile
└── project
│ └── test
│ ├── Experiment_setup_test.xls
│ ├── Experiment_setup_test.xlsx
│ ├── results
│ ├── Designtable_optimal_Nrun72.csv
│ ├── experiment_results_Nrun72.xls
│ └── experiment_results_Nrun72.xlsx
│ ├── settings_design_test.yaml
│ └── settings_expresults_test.yaml
├── docs
├── DoEgen_explained
│ ├── 01_experiment_setup_definition_.md
│ ├── 02_design_generation_.md
│ ├── 03_design_evaluation___efficiency_metrics_.md
│ ├── 04_design_selection_.md
│ ├── 05_experiment_result_input___merging_.md
│ ├── 06_result_analysis___statistics_.md
│ ├── 07_result_visualization_.md
│ ├── 08_configuration_handling_.md
│ └── index.md
├── MANUAL.md
└── MANUAL.pdf
├── doegen
├── Experiment_results.xlsx
├── Experiment_setup.xlsx
├── Experiment_setup_extended.xlsx
├── __init__.py
├── configloader.py
├── configloader_results.py
├── create_resultfile.py
├── create_setupfile.py
├── create_setupfile_extended.py
├── doegen.py
├── doeval.py
├── init_config.py
├── init_tests.py
├── settings_design.yaml
├── settings_expresults.yaml
└── test
│ ├── Experiment_setup_test.xlsx
│ ├── results
│ ├── Designtable_best_Nrun18.csv
│ ├── Designtable_minimum_Nrun30.csv
│ ├── Designtable_optimal_Nrun72.csv
│ └── experiment_results_Nrun72.xlsx
│ ├── settings_design_test.yaml
│ └── settings_expresults_test.yaml
├── figures
├── BestFactor_Avg1.png
├── Designtable_optimal_Nrun72.png
├── Efficiencies.png
├── Efficiencies_[3, 3, 3, 3, 3, 3, 2, 2].png
├── Experiment_result_Nrun72_header.png
├── Expresult_correlation_X_1.png
├── Expresult_pairwise-correlation_1.png
├── Result_header.png
├── Results_overview.png
├── Setup_header.png
├── Setup_header_test.png
├── Top10.png
├── Ybarplot_1.png
└── pairwise_correlation.png
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .eggs/*
2 | build/
3 | *.egg
4 | DoEgen.egg-info/dependency_links.txt
5 | DoEgen.egg-info/PKG-INFO
6 | DoEgen.egg-info/requires.txt
7 | DoEgen.egg-info/SOURCES.txt
8 | DoEgen.egg-info/top_level.txt
9 | build/lib/doegen/doegen.py
11 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | ### GNU LESSER GENERAL PUBLIC LICENSE
2 |
3 | Version 3, 29 June 2007
4 |
5 | Copyright (C) 2007 Free Software Foundation, Inc.
6 |
7 |
8 | Everyone is permitted to copy and distribute verbatim copies of this
9 | license document, but changing it is not allowed.
10 |
11 | This version of the GNU Lesser General Public License incorporates the
12 | terms and conditions of version 3 of the GNU General Public License,
13 | supplemented by the additional permissions listed below.
14 |
15 | #### 0. Additional Definitions.
16 |
17 | As used herein, "this License" refers to version 3 of the GNU Lesser
18 | General Public License, and the "GNU GPL" refers to version 3 of the
19 | GNU General Public License.
20 |
21 | "The Library" refers to a covered work governed by this License, other
22 | than an Application or a Combined Work as defined below.
23 |
24 | An "Application" is any work that makes use of an interface provided
25 | by the Library, but which is not otherwise based on the Library.
26 | Defining a subclass of a class defined by the Library is deemed a mode
27 | of using an interface provided by the Library.
28 |
29 | A "Combined Work" is a work produced by combining or linking an
30 | Application with the Library. The particular version of the Library
31 | with which the Combined Work was made is also called the "Linked
32 | Version".
33 |
34 | The "Minimal Corresponding Source" for a Combined Work means the
35 | Corresponding Source for the Combined Work, excluding any source code
36 | for portions of the Combined Work that, considered in isolation, are
37 | based on the Application, and not on the Linked Version.
38 |
39 | The "Corresponding Application Code" for a Combined Work means the
40 | object code and/or source code for the Application, including any data
41 | and utility programs needed for reproducing the Combined Work from the
42 | Application, but excluding the System Libraries of the Combined Work.
43 |
44 | #### 1. Exception to Section 3 of the GNU GPL.
45 |
46 | You may convey a covered work under sections 3 and 4 of this License
47 | without being bound by section 3 of the GNU GPL.
48 |
49 | #### 2. Conveying Modified Versions.
50 |
51 | If you modify a copy of the Library, and, in your modifications, a
52 | facility refers to a function or data to be supplied by an Application
53 | that uses the facility (other than as an argument passed when the
54 | facility is invoked), then you may convey a copy of the modified
55 | version:
56 |
57 | - a) under this License, provided that you make a good faith effort
58 | to ensure that, in the event an Application does not supply the
59 | function or data, the facility still operates, and performs
60 | whatever part of its purpose remains meaningful, or
61 | - b) under the GNU GPL, with none of the additional permissions of
62 | this License applicable to that copy.
63 |
64 | #### 3. Object Code Incorporating Material from Library Header Files.
65 |
66 | The object code form of an Application may incorporate material from a
67 | header file that is part of the Library. You may convey such object
68 | code under terms of your choice, provided that, if the incorporated
69 | material is not limited to numerical parameters, data structure
70 | layouts and accessors, or small macros, inline functions and templates
71 | (ten or fewer lines in length), you do both of the following:
72 |
73 | - a) Give prominent notice with each copy of the object code that
74 | the Library is used in it and that the Library and its use are
75 | covered by this License.
76 | - b) Accompany the object code with a copy of the GNU GPL and this
77 | license document.
78 |
79 | #### 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that, taken
82 | together, effectively do not restrict modification of the portions of
83 | the Library contained in the Combined Work and reverse engineering for
84 | debugging such modifications, if you also do each of the following:
85 |
86 | - a) Give prominent notice with each copy of the Combined Work that
87 | the Library is used in it and that the Library and its use are
88 | covered by this License.
89 | - b) Accompany the Combined Work with a copy of the GNU GPL and this
90 | license document.
91 | - c) For a Combined Work that displays copyright notices during
92 | execution, include the copyright notice for the Library among
93 | these notices, as well as a reference directing the user to the
94 | copies of the GNU GPL and this license document.
95 | - d) Do one of the following:
96 | - 0) Convey the Minimal Corresponding Source under the terms of
97 | this License, and the Corresponding Application Code in a form
98 | suitable for, and under terms that permit, the user to
99 | recombine or relink the Application with a modified version of
100 | the Linked Version to produce a modified Combined Work, in the
101 | manner specified by section 6 of the GNU GPL for conveying
102 | Corresponding Source.
103 | - 1) Use a suitable shared library mechanism for linking with
104 | the Library. A suitable mechanism is one that (a) uses at run
105 | time a copy of the Library already present on the user's
106 | computer system, and (b) will operate properly with a modified
107 | version of the Library that is interface-compatible with the
108 | Linked Version.
109 | - e) Provide Installation Information, but only if you would
110 | otherwise be required to provide such information under section 6
111 | of the GNU GPL, and only to the extent that such information is
112 | necessary to install and execute a modified version of the
113 | Combined Work produced by recombining or relinking the Application
114 | with a modified version of the Linked Version. (If you use option
115 | 4d0, the Installation Information must accompany the Minimal
116 | Corresponding Source and Corresponding Application Code. If you
117 | use option 4d1, you must provide the Installation Information in
118 | the manner specified by section 6 of the GNU GPL for conveying
119 | Corresponding Source.)
120 |
121 | #### 5. Combined Libraries.
122 |
123 | You may place library facilities that are a work based on the Library
124 | side by side in a single library together with other library
125 | facilities that are not Applications and are not covered by this
126 | License, and convey such a combined library under terms of your
127 | choice, if you do both of the following:
128 |
129 | - a) Accompany the combined library with a copy of the same work
130 | based on the Library, uncombined with any other library
131 | facilities, conveyed under the terms of this License.
132 | - b) Give prominent notice with the combined library that part of it
133 | is a work based on the Library, and explaining where to find the
134 | accompanying uncombined form of the same work.
135 |
136 | #### 6. Revised Versions of the GNU Lesser General Public License.
137 |
138 | The Free Software Foundation may publish revised and/or new versions
139 | of the GNU Lesser General Public License from time to time. Such new
140 | versions will be similar in spirit to the present version, but may
141 | differ in detail to address new problems or concerns.
142 |
143 | Each version is given a distinguishing version number. If the Library
144 | as you received it specifies that a certain numbered version of the
145 | GNU Lesser General Public License "or any later version" applies to
146 | it, you have the option of following the terms and conditions either
147 | of that published version or of any later version published by the
148 | Free Software Foundation. If the Library as you received it does not
149 | specify a version number of the GNU Lesser General Public License, you
150 | may choose any version of the GNU Lesser General Public License ever
151 | published by the Free Software Foundation.
152 |
153 | If the Library as you received it specifies that a proxy can decide
154 | whether future versions of the GNU Lesser General Public License shall
155 | apply, that proxy's public statement of acceptance of any version is
156 | permanent authorization for you to choose that version for the
157 | Library.
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 | FROM continuumio/miniconda3
3 |
4 | # Install conda environment
5 | RUN conda create -n doegen_app python=3.7
6 |
7 | # Activate conda environment and install swig and numpy
8 | ENV PATH /opt/conda/envs/doegen_app/bin:$PATH
9 | RUN /bin/bash -c ". activate doegen_app" && \
10 | conda install --yes swig
11 |
12 | # Install DoEgen (incl OApackage) from PyPi
13 | # Numpy and gcc must be explicitly installed before installing OApackage
14 | RUN apt-get update && apt-get install -y g++
15 | RUN pip install numpy
16 | # DoEgen install might take a few minutes since OApackage wheel building takes very long
17 | RUN pip install DoEgen
18 |
19 | WORKDIR project/
20 | ENTRYPOINT ["python", "-m"]
21 |
22 | # Give default arguments, in case none are supplied on
23 | # the command-line, e.g.
24 | CMD ["doegen.init_tests"]
25 |
26 |
27 | #HOW TO BUILD (IN SHELL), e.g.:
28 | #docker build -t doegen-app:v1 .
29 |
30 | #HOW TO RUN (IN SHELL), e.g.:
31 | #docker run -it -v /project:/project doegen-app:v1 doegen.doegen .yaml
32 | #docker run -it -v /project:/project doegen-app:v1 doegen.doeval .yaml
33 |
34 |
--------------------------------------------------------------------------------
/docker/project/test/Experiment_setup_test.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/Experiment_setup_test.xls
--------------------------------------------------------------------------------
/docker/project/test/Experiment_setup_test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/Experiment_setup_test.xlsx
--------------------------------------------------------------------------------
/docker/project/test/results/Designtable_optimal_Nrun72.csv:
--------------------------------------------------------------------------------
1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8
2 | 1,0,-10,1,1,L1,0,0,L1
3 | 2,3,-3,3,1,L2,0,1,L2
4 | 3,0,-10,3,3,L3,0,1,L2
5 | 4,6,-10,3,5,L3,1,1,L2
6 | 5,0,4,3,1,L3,0,0,L1
7 | 6,3,-3,5,1,L3,2,0,L2
8 | 7,0,-3,3,3,L2,2,1,L2
9 | 8,6,4,1,1,L1,2,1,L2
10 | 9,3,4,5,3,L2,1,1,L1
11 | 10,6,4,5,5,L1,0,0,L2
12 | 11,3,4,1,1,L2,0,0,L2
13 | 12,6,-3,1,5,L2,2,0,L1
14 | 13,6,-3,3,1,L1,0,0,L1
15 | 14,6,-10,1,1,L2,2,1,L1
16 | 15,3,-10,1,5,L1,1,1,L1
17 | 16,6,-3,1,3,L1,0,1,L1
18 | 17,3,-3,1,3,L3,2,0,L2
19 | 18,0,-3,1,5,L1,1,0,L2
20 | 19,0,-3,3,5,L2,0,1,L1
21 | 20,0,-3,1,5,L3,1,0,L2
22 | 21,6,-3,5,1,L2,1,0,L2
23 | 22,6,-3,3,3,L3,2,0,L1
24 | 23,6,-10,1,3,L2,0,1,L2
25 | 24,6,4,3,1,L2,1,0,L2
26 | 25,6,4,5,3,L1,2,1,L2
27 | 26,6,-10,3,3,L2,2,1,L2
28 | 27,0,4,3,1,L1,1,1,L1
29 | 28,6,-10,5,5,L3,0,0,L2
30 | 29,6,4,1,5,L3,2,0,L1
31 | 30,6,-3,5,5,L2,1,1,L1
32 | 31,0,4,5,3,L2,0,0,L1
33 | 32,0,-3,3,5,L1,2,1,L2
34 | 33,0,4,1,5,L3,1,1,L2
35 | 34,0,4,5,5,L2,2,0,L2
36 | 35,6,-10,5,5,L1,2,1,L1
37 | 36,3,-10,5,1,L3,1,1,L2
38 | 37,0,-10,5,3,L1,1,0,L1
39 | 38,3,-10,5,1,L3,2,0,L1
40 | 39,3,4,5,3,L1,2,1,L1
41 | 40,3,-10,5,1,L1,1,0,L2
42 | 41,3,-10,3,5,L3,1,0,L1
43 | 42,0,-10,3,1,L2,2,0,L1
44 | 43,0,-3,1,1,L1,1,0,L2
45 | 44,6,4,5,3,L3,0,0,L2
46 | 45,3,-10,3,3,L1,2,0,L2
47 | 46,3,-3,3,3,L2,1,1,L1
48 | 47,3,4,1,5,L2,2,0,L1
49 | 48,6,4,3,3,L1,1,0,L1
50 | 49,3,-3,5,5,L3,0,1,L1
51 | 50,3,-3,1,1,L1,0,1,L1
52 | 51,0,4,5,1,L3,0,1,L1
53 | 52,0,4,1,3,L3,1,1,L2
54 | 53,3,4,3,5,L2,0,0,L2
55 | 54,0,-3,5,3,L1,0,0,L1
56 | 55,6,-3,3,3,L3,1,0,L2
57 | 56,3,-3,1,3,L3,0,1,L1
58 | 57,0,4,3,1,L3,2,1,L1
59 | 58,3,4,1,5,L2,0,1,L2
60 | 59,3,4,3,5,L1,2,0,L1
61 | 60,0,-10,5,1,L2,0,1,L2
62 | 61,3,-10,3,5,L1,0,0,L2
63 | 62,6,-10,3,5,L3,0,1,L1
64 | 63,3,4,3,3,L1,1,1,L2
65 | 64,3,-10,1,3,L2,1,0,L1
66 | 65,3,-3,5,1,L3,2,1,L2
67 | 66,6,-3,5,1,L2,1,0,L1
68 | 67,0,-3,5,5,L1,2,1,L2
69 | 68,6,-10,1,1,L1,0,1,L2
70 | 69,6,4,1,1,L3,1,1,L1
71 | 70,0,-10,1,3,L2,2,0,L2
72 | 71,0,-10,5,5,L2,1,1,L1
73 | 72,0,-10,1,3,L3,2,0,L1
74 |
--------------------------------------------------------------------------------
/docker/project/test/results/experiment_results_Nrun72.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/results/experiment_results_Nrun72.xls
--------------------------------------------------------------------------------
/docker/project/test/results/experiment_results_Nrun72.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docker/project/test/results/experiment_results_Nrun72.xlsx
--------------------------------------------------------------------------------
/docker/project/test/settings_design_test.yaml:
--------------------------------------------------------------------------------
1 | # Settings for Experiment Design Generation
2 |
3 | # Path to exp design setup file
4 | path: 'test/'
5 | # Set path for output files. If empty (''), output folder will be same as above for setup file
6 | outpath: 'test/results_test/'
7 | # Filename for exp setup file
8 | fname_setup: 'Experiment_setup_test.xlsx'
9 | # Maximum number of experimental runs:
10 | nrun_max: 20
11 |
12 | # Set maximum time for optimization per run (in seconds, recommended to set to at least ~100s)
13 | maxtime_per_run: 80
14 |
15 | # Set maximal stepsize of run size interval, so that not each run size has to be optimized
16 | # The larger the interval, the faster the total computation
17 | # by default (select delta_nrun = None) it will select automatically the interval step with the lowest common multiple of the levels
18 | # (e.g. if mix between level 2 and 3 thus will results in delta_nrun = 6)
19 | delta_nrun: None
--------------------------------------------------------------------------------
/docker/project/test/settings_expresults_test.yaml:
--------------------------------------------------------------------------------
1 | # Settings for Experiment Result Evaluation
2 |
3 | # Path to exp design setup file
4 | inpath: 'test/results/'
5 | # Set path for output files. If '', output folder will be same as above for setup file
6 | outpath: 'test/doeval_results/'
7 | # Filename for exp design table in inpath
8 | fname_design: 'Designtable_optimal_Nrun72.csv'
9 | # Filename for exp results in inpath:
10 | fname_results: 'experiment_results_Nrun72.xlsx'
--------------------------------------------------------------------------------
/docs/DoEgen_explained/01_experiment_setup_definition_.md:
--------------------------------------------------------------------------------
1 | # Chapter 1: Experiment Setup Definition
2 |
3 | Welcome to the `DoEgen` tutorial! Before we can start designing clever experiments, we first need to clearly tell `DoEgen` *what* we want to test. This chapter is all about defining the structure and ingredients of your experiment.
4 |
5 | ## What's the Big Idea? Planning Your Experiment Recipe
6 |
7 | Imagine you want to bake the perfect cake. You wouldn't just randomly throw ingredients into a bowl! You'd start with a recipe that lists:
8 | * **Ingredients:** Flour, Sugar, Eggs, Temperature, Baking Time...
9 | * **Amounts/Settings:** 2 cups of Flour, 1 cup of Sugar, Bake at 180°C for 30 minutes...
10 | * **Types:** White Flour vs. Whole Wheat, Granulated Sugar vs. Brown Sugar...
11 |
12 | Defining an **Experiment Setup** in `DoEgen` is exactly like creating this recipe list for your experiment. It tells `DoEgen` precisely:
13 | * What factors (variables or "ingredients") you want to change and study.
14 | * What different values or types (levels) each factor can have.
15 | * Whether a factor is a number (like temperature) or a category (like material type).
16 |
17 | This definition is the absolute foundation for everything that follows. Without a clear setup, `DoEgen` can't generate an efficient plan for your experiments.
18 |
19 | ## Key Ingredients of an Experiment Setup
20 |
21 | Let's break down the components you need to define:
22 |
23 | 1. **Factors:** These are the variables you control and want to investigate in your experiment.
24 | * *Analogy:* In our cake example, `Temperature`, `Sugar Amount`, and `Flour Type` are factors.
25 | * *Example:* If studying plant growth, factors might be `Water Amount`, `Sunlight Hours`, `Fertilizer Type`.
26 |
27 | 2. **Factor Types:** Factors can be different kinds:
28 | * **Numeric:** Represented by numbers.
29 | * **Continuous:** Can take any value within a range (e.g., Temperature: 20.5°C, 31.2°C, etc.).
30 | * **Discrete:** Can only take specific numeric values, often whole numbers (e.g., Number of Seeds planted: 1, 2, 3).
31 | * **Categorical:** Represented by distinct categories or labels, not numbers (e.g., Fertilizer Type: 'Brand A', 'Brand B', 'Organic').
32 |
33 | 3. **Levels:** These are the specific values or settings you choose to test for each factor.
34 | * *Analogy:* For the `Temperature` factor, you might test 3 levels: 170°C, 180°C, 190°C. For `Flour Type`, you might test 2 levels: 'White', 'Whole Wheat'.
35 | * The **number of levels** tells `DoEgen` how many different settings you want to examine for that factor. More levels allow for more detailed analysis but usually require more experiments.
36 |
37 | 4. **Ranges / Specific Values:** How you define the levels depends on the factor type:
38 | * **Numeric:** You typically provide a `Minimum` and `Maximum` value. `DoEgen` then calculates evenly spaced levels based on the `Level Number` you specified. For example, if Temperature is Continuous, has 3 Levels, Min=20, Max=40, the levels might be 20, 30, 40.
39 | * **Categorical:** You usually list the exact names of the categories (levels). For example, if Fertilizer Type has 2 levels, you might specify them as 'Brand A', 'Brand B'.
40 |
41 | ## The Experiment Setup Excel Template
42 |
43 | The easiest way to give `DoEgen` this "recipe" is by filling out a simple Excel spreadsheet. `DoEgen` provides template files to get you started.
44 |
45 | You can create a blank template by running a helper script included with `DoEgen`:
46 |
47 | ```bash
48 | # Run this command in your terminal in the DoEgen project directory
49 | python -m doegen.create_setupfile
50 | # Or for an extended version with more options:
51 | # python -m doegen.create_setupfile_extended
52 | ```
53 |
54 | This creates an Excel file (like `Experiment_setup_template.xlsx`) with the necessary columns. Here's what the main columns mean:
55 |
56 | | Column Header | Description | Example |
57 | | :--------------- | :---------------------------------------------------------------------------------------------------------------------------------------- | :---------------- |
58 | | `Parameter Name` | The name of your factor. | `Temperature` |
59 | | `Parameter Type` | The type of factor: `Continuous`, `Discrete`, or `Categorical`. | `Continuous` |
60 | | `Level Number` | How many different values/settings you want to test for this factor. | `3` |
61 | | `Minimum` | The lowest value for Numeric factors. (Leave blank for Categorical). | `20` |
62 | | `Maximum` | The highest value for Numeric factors. (Leave blank for Categorical). | `40` |
63 | | `Levels` (Optional) | For Categorical factors, list the exact level names separated by commas. Can also be used for specific numeric levels. | `Brand A, Brand B` |
64 | | `Include (Y/N)` (Optional) | Set to 'No' if you want to list a factor but *not* vary it in this specific design. Defaults to 'Yes'. | `Yes` |
65 |
66 | Here's how you might fill it out for a simple experiment:
67 |
68 | *(Based on the image from `MANUAL.md`)*
69 | {width=600}
70 |
71 | **Example Fill-out:**
72 |
73 | | Parameter Name | Parameter Type | Level Number | Minimum | Maximum | Levels |
74 | | :------------- | :------------- | :----------- | :------ | :------ | :--------------------- |
75 | | Temperature | Continuous | 3 | 20 | 40 | |
76 | | Pressure | Discrete | 2 | 1 | 5 | |
77 | | Catalyst | Categorical | 2 | | | Catalyst X, Catalyst Y |
78 | | Speed | Continuous | 3 | 100 | 300 | |
79 |
80 | This table clearly tells `DoEgen`:
81 | * We have 4 factors: Temperature, Pressure, Catalyst, Speed.
82 | * Temperature is continuous, tested at 3 levels between 20 and 40.
83 | * Pressure is discrete, tested at 2 levels between 1 and 5.
84 | * Catalyst is categorical, tested with 'Catalyst X' and 'Catalyst Y'.
85 | * Speed is continuous, tested at 3 levels between 100 and 300.
86 |
87 | ## How `DoEgen` Reads Your Recipe (Simplified View)
88 |
89 | Under the hood, `DoEgen` uses Python code to read this Excel file and understand your experimental setup. It primarily uses the `pandas` library to handle the spreadsheet data.
90 |
91 | Here's a very simplified Python snippet illustrating the core idea (the actual code in `doegen/doegen.py` within the `read_setup_new` function is more detailed):
92 |
93 | ```python
94 | # Simplified view of how DoEgen reads the setup file (doegen/doegen.py)
95 | import pandas as pd
96 |
97 | def read_setup_simplified(fname_setup):
98 | """Reads the Excel setup file and extracts factor information."""
99 | try:
100 | # Use pandas library to read the Excel file into a table (DataFrame)
101 | df = pd.read_excel(fname_setup)
102 | print(f"Successfully read setup file: {fname_setup}")
103 |
104 | # --- Extract Basic Info ---
105 | # Get lists of names, types, levels, etc. from the table columns
106 | factor_names = df["Parameter Name"].tolist()
107 | level_numbers = df["Level Number"].tolist()
108 | # ... extract other columns like Parameter Type, Min, Max, Levels ...
109 |
110 | # --- Determine Specific Level Values (Simplified Logic) ---
111 | # (Actual code calculates numeric levels based on min/max/count
112 | # and parses categorical levels from the 'Levels' column)
113 | # level_values = calculate_actual_levels(df) # Placeholder
114 |
115 | print(f"Found {len(factor_names)} factors to include:")
116 | print(f" Names: {factor_names}")
117 | print(f" Levels per factor: {level_numbers}")
118 |
119 | # Store this information in a structured way (like the ExperimentalSetup object)
120 | # setup_object = create_setup_object(level_numbers, level_values, factor_names)
121 | # return setup_object # Return the processed setup info
122 |
123 | except FileNotFoundError:
124 | print(f"Error: Setup file not found at {fname_setup}")
125 | except Exception as e:
126 | print(f"Error reading setup file: {e}")
127 |
128 | # Example of how DoEgen might use this function internally:
129 | # experiment_setup = read_setup_simplified("Experiment_setup_template.xlsx")
130 | # if experiment_setup:
131 | # # Now use experiment_setup for Design Generation...
132 | # pass
133 | ```
134 |
135 | This code essentially:
136 | 1. Opens and reads the Excel file specified.
137 | 2. Pulls out the information from each column (Parameter Name, Level Number, etc.).
138 | 3. Processes this raw information to figure out the exact level values for each factor (e.g., calculating `[20, 30, 40]` for Temperature).
139 | 4. Packages this structured information neatly so other parts of `DoEgen` can use it.
140 |
141 | ## The Process Flow
142 |
143 | Here's a simple diagram showing how your Excel file becomes the setup definition inside `DoEgen`:
144 |
145 | ```mermaid
146 | sequenceDiagram
147 | participant U as User
148 | participant DG as DoEgen (Main Script)
149 | participant RSN as read_setup_new() Function
150 | participant P as Pandas Library
151 | participant ESO as ExperimentSetup Object
152 |
153 | U->>DG: Specifies path to 'Experiment_setup.xlsx'
154 | DG->>RSN: Calls read_setup_new() with the path
155 | RSN->>P: Asks Pandas to read the Excel file
156 | P-->>RSN: Returns the data as a table (DataFrame)
157 | RSN->>RSN: Extracts columns (Names, Types, Levels, Min, Max...)
158 | RSN->>RSN: Calculates specific level values (e.g., [20, 30, 40])
159 | RSN->>ESO: Creates an 'ExperimentSetup' object containing all processed info
160 | ESO-->>RSN: Returns the created object
161 | RSN-->>DG: Returns the completed 'ExperimentSetup' object
162 | Note right of DG: DoEgen now has the structured recipe!
163 | ```
164 |
165 | ## Conclusion
166 |
167 | In this chapter, we learned the fundamental concept of the **Experiment Setup Definition**. It's the crucial first step where you precisely define the "ingredients" (factors), their "types" (numeric/categorical), and the specific "settings" (levels) you want to test in your experiment. We saw how to provide this information using a structured Excel template.
168 |
169 | This setup definition acts as the blueprint or recipe that `DoEgen` needs. With this information clearly defined, we are now ready to move on to the next exciting step: actually creating the experimental plan.
170 |
171 | Let's dive into [Chapter 2: Design Generation](02_design_generation_.md) to see how `DoEgen` uses this setup to build an efficient experiment schedule!
173 |
174 | ---
175 |
176 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/02_design_generation_.md:
--------------------------------------------------------------------------------
1 | # Chapter 2: Design Generation
2 |
3 | In [Chapter 1: Experiment Setup Definition](01_experiment_setup_definition_.md), we learned how to create the "recipe" for our experiment by defining the factors, levels, and types. Now that `DoEgen` knows *what* we want to test, this chapter focuses on *how* to plan the actual sequence of experiments efficiently. This core process is called **Design Generation**.
5 |
6 | ## The Challenge: Too Many Experiments!
7 |
8 | Imagine you want to test different settings for baking that perfect cake from Chapter 1. Let's say you have:
9 | * Temperature: 3 levels (170°C, 180°C, 190°C)
10 | * Sugar Amount: 3 levels (0.8 cup, 1 cup, 1.2 cups)
11 | * Flour Type: 2 levels (White, Whole Wheat)
12 | * Baking Time: 3 levels (25 min, 30 min, 35 min)
13 |
14 | If you wanted to test *every single possible combination* (a "full factorial" design), you'd need to bake:
15 | `3 (Temp) * 3 (Sugar) * 2 (Flour) * 3 (Time) = 54` cakes!
16 |
17 | That's a lot of baking! For more complex experiments with more factors or levels, the number of combinations explodes quickly. This is where **Design Generation** comes in.
18 |
19 | ## The Solution: A Smart Scheduler for Experiments
20 |
21 | **Design Generation** in `DoEgen` is like using a **smart scheduler** to plan your experiments. Instead of running every single combination, it intelligently selects a much smaller, representative set of runs.
22 |
23 | Think about test-driving cars. You want to evaluate different features (engine type, transmission, color, trim level). You *could* test drive every single possible configuration, but that would take forever! A smart scheduler would help you pick the minimum number of diverse test drives needed to get a good feel for all the important features and how they might interact, without driving hundreds of cars.
24 |
25 | `DoEgen` aims to create designs (experimental plans) that are:
26 | 1. **Efficient:** Uses the minimum number of runs possible to get meaningful results.
27 | 2. **Balanced:** Tests each level of each factor roughly the same number of times. (Like making sure you test both 'White' and 'Whole Wheat' flour fairly).
28 | 3. **Near-Orthogonal:** Tries to ensure factors can be evaluated independently. (Ideally, changing the 'Temperature' shouldn't automatically force a change in 'Baking Time' in your plan).
29 |
30 | To achieve this, `DoEgen` cleverly uses the `OApackage` library, which specializes in finding these kinds of optimized experimental plans, often based on mathematical structures called Orthogonal Arrays.
31 |
32 | ## How to Generate a Design with `DoEgen`
33 |
34 | Generating a design involves two main things:
35 | 1. Your **Experiment Setup file** (the Excel file we created in Chapter 1).
36 | 2. A **Settings file** (usually `settings_design.yaml`) that tells `DoEgen` things like:
37 | * Where to find your setup file.
38 | * How many experimental runs you're willing to do (e.g., a minimum and maximum number).
39 | * Where to save the generated designs.
40 | * How much computer time to spend searching for the best design. (We'll cover settings files in detail in [Chapter 8: Configuration Handling](08_configuration_handling_.md)).
42 |
43 | Once you have these ready, you run `DoEgen` from your terminal:
44 |
45 | ```bash
46 | # Make sure your setup file (e.g., Experiment_setup.xlsx) is ready
47 | # Make sure your settings file (e.g., settings_design.yaml) is configured
48 |
49 | # Run the design generation module
50 | python -m doegen.doegen settings_design.yaml
51 | ```
52 |
53 | **What does this command do?**
54 | * It tells Python to run the `doegen` module within the `doegen` package.
55 | * It passes the `settings_design.yaml` file as input, which tells `DoEgen` all the specifics for this run.
56 |
57 | **What happens next?**
58 | `DoEgen` will:
59 | 1. Read your setup file (`Experiment_setup.xlsx`).
60 | 2. Read your settings file (`settings_design.yaml`).
61 | 3. Figure out the range of run numbers to explore (e.g., from 12 runs up to 150 runs, in steps of 6, based on your settings).
62 | 4. For *each* run number in that range, it will use `OApackage` to search for an optimized design (a sequence of experiments). This can take some time, especially for larger designs.
63 | 5. It saves each generated design as a simple table (CSV file).
64 | 6. It also calculates some quality scores (efficiencies) for each design and saves those too. (More on this in [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md)).
65 |
66 |
67 | **Example Output (Simplified Design Array):**
68 |
69 | After running, `DoEgen` might create files like `EDarray_[3, 3, 2, 3]_Nrun54.csv` in an output folder. This file contains the actual plan. It looks like a table where each row is one experiment run, and each column is a factor:
70 |
71 | ```csv
72 | # Simplified Example: EDarray_..._Nrun12.csv (A design with 12 runs)
73 | # Note: Actual arrays use numbers (0, 1, 2...) to represent levels internally.
74 | # DoEgen later converts these back to your actual values (e.g., 170C, 180C).
75 |
76 | # Factor Levels: Temp(3), Sugar(3), Flour(2), Time(3)
77 | # Levels represented as 0, 1, 2...
78 |
79 | 0,0,0,0 # Run 1: Temp=Level 0, Sugar=Level 0, Flour=Level 0, Time=Level 0
80 | 1,1,0,1 # Run 2: Temp=Level 1, Sugar=Level 1, Flour=Level 0, Time=Level 1
81 | 2,2,0,2 # Run 3: Temp=Level 2, Sugar=Level 2, Flour=Level 0, Time=Level 2
82 | 0,1,1,2 # Run 4: Temp=Level 0, Sugar=Level 1, Flour=Level 1, Time=Level 2
83 | 1,2,1,0 # Run 5: Temp=Level 1, Sugar=Level 2, Flour=Level 1, Time=Level 0
84 | 2,0,1,1 # Run 6: Temp=Level 2, Sugar=Level 0, Flour=Level 1, Time=Level 1
85 | 0,2,0,1 # Run 7: ... and so on ...
86 | 1,0,0,2
87 | 2,1,0,0
88 | 0,0,1,0
89 | 1,1,1,1
90 | 2,2,1,2
91 | ```
92 |
93 | This table is the core output – your optimized schedule of experiments!
94 |
95 | ## What's Happening Under the Hood?
96 |
97 | Let's peek inside `DoEgen` to see the main steps when you run the design generation command:
98 |
99 | 1. **Read Inputs:** `DoEgen` first reads your experiment recipe from the Excel setup file ([Chapter 1: Experiment Setup Definition](01_experiment_setup_definition_.md)) and the run parameters from the `settings_design.yaml` file ([Chapter 8: Configuration Handling](08_configuration_handling_.md)).
100 |
101 |
102 | 2. **Determine Run Sizes:** It calculates the range of experiment sizes (number of runs) to investigate, based on your settings (e.g., minimum runs, maximum runs, step size).
103 | 3. **Optimize for Each Size:** For each number of runs (e.g., 12 runs, 18 runs, 24 runs...), it calls the core optimization function.
104 | * This function (`optimize_design`) uses the `OApackage` library. `OApackage` is the "engine" that searches for balanced, near-orthogonal designs with the specified number of runs and factor levels. It tries many possibilities and selects the best ones it finds within the allowed time.
105 | 4. **Evaluate Design Quality:** After `OApackage` proposes a design, `DoEgen` calculates various quality metrics (like balance, orthogonality, D-efficiency) using its `evaluate_design2` function. We'll learn about these metrics in [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md).
106 |
107 | 5. **Save Results:** The best design found for that specific run size (e.g., the best 12-run design) and its associated quality scores are saved to files.
108 | 6. **Repeat:** Steps 3-5 are repeated for all the different run sizes you asked `DoEgen` to explore.
109 | 7. **Select & Summarize:** Finally, `DoEgen` analyzes the quality scores across all generated designs and suggests a few "good" options (minimum, optimal, best) based on predefined criteria. This is covered in [Chapter 4: Design Selection](04_design_selection_.md).
110 |
111 |
112 | Here's a simplified view of the process:
113 |
114 | ```mermaid
115 | sequenceDiagram
116 | participant U as User
117 | participant DG as doegen.py (Main Script)
118 | participant OD as optimize_design() Function
119 | participant OAP as OApackage Library
120 | participant ED as evaluate_design2() Function
121 | participant Files as Output Files
122 |
123 | U->>DG: Runs `python -m doegen.doegen settings.yaml`
124 | DG->>DG: Reads Setup & Settings
125 | DG->>OD: Calls optimize_design() for Run Size N
126 | Note right of OD: Tries to find best N-run design
127 | OD->>OAP: Asks OApackage to generate candidate designs
128 | OAP-->>OD: Returns potential design(s)
129 | OD->>ED: Asks evaluate_design2() to score the design
130 | ED-->>OD: Returns quality metrics (efficiencies)
131 | OD-->>DG: Returns best design found for size N & its scores
132 | DG->>Files: Saves Design Array (CSV) & Efficiencies (CSV)
133 | Note right of DG: Repeats for other run sizes...
134 | Note right of DG: Finally, suggests best designs (Ch 4)
135 |
136 | ```
137 |
138 | ## Diving Deeper into the Code (Simplified View)
139 |
140 | The main script `doegen/doegen.py` orchestrates this process.
141 |
142 | 1. **Reading Setup:** It uses functions like `read_setup_new` (which we saw in Chapter 1) to load your experiment definition.
143 |
144 | 2. **Looping and Optimizing:** The `main` function sets up a loop (or uses multiprocessing via `optimize_design_multi`) to iterate through the desired run sizes (e.g., `nrun_min` to `nrun_max` in steps of `ndelta`). Inside this loop, it calls `optimize_design` for each run size.
145 |
146 | ```python
147 | # Simplified view from doegen/doegen.py - main function logic
148 |
149 | def main(fname_setup, outpath, nrun_max, maxtime_per_run, delta_nrun, nrun_min):
150 | # 1. Read the experiment setup
151 | setup = ExperimentalSetup.read(fname_setup)
152 | print(f"Read setup for {setup.number_of_factors} factors.")
153 |
154 | # 2. Determine the range of run sizes to test
155 | # (Calculates nrun_min if not given, determines step size ndelta)
156 | ndelta = delta_nrun # Simplified
157 | # ... calculation of actual nrun_min ...
158 | xrun = np.arange(nrun_min, nrun_max, ndelta) # e.g., [12, 18, 24, ...]
159 | print(f"Will generate designs for run sizes: {xrun}")
160 |
161 | # 3. Optimize for each run size (potentially in parallel)
162 | all_efficiencies = []
163 | for runsize in xrun:
164 | print(f"--- Optimizing for {runsize} runs ---")
165 | # Call the core optimization function
166 | effs = optimize_design(setup, outpath, maxtime_per_run, ndelta, runsize)
167 | all_efficiencies.append(effs)
168 | # (Actual code might use optimize_design_multi for parallelism)
169 |
170 | # 4. Process results, save summary, suggest designs (See Ch 3 & 4)
171 | # ... save combined efficiencies ...
172 | # ... select minimum, optimal, best designs ...
173 | print("FINISHED Design Generation")
174 | ```
175 |
176 | 3. **Core Optimization (`optimize_design`):** This function is the heart of the generation process. It prepares the inputs for `OApackage` and calls its optimization routine.
177 |
178 | ```python
179 | # Simplified view from doegen/doegen.py - optimize_design function
180 |
181 | import oapackage # The core library for finding designs
182 |
183 | def optimize_design(setup, outpath, runtime, delta, runsize, printopt=True):
184 | """Optimizes design for a specific runsize."""
185 | print(f"Searching for best design with {runsize} runs...")
186 | outpath_nrun = os.path.join(outpath, f"DesignArray_Nrun{runsize}/")
187 |
188 | # Define the problem for OApackage
189 | arrayclass = oapackage.arraydata_t(
190 | setup.factor_levels, runsize, 0, setup.number_of_factors
191 | )
192 |
193 | # Ask OApackage to find good designs (this is the complex part!)
194 | # It tries many random starts and improvements.
195 | # 'alpha' weights different optimization criteria.
196 | alpha = [5, 5, 15] # Example weights
197 | # 'niter' relates to how long it searches (calculated based on 'runtime')
198 | niter = calculate_iterations_based_on_time(runtime) # Simplified placeholder
199 |
200 | scores, _, designs, _ = oapackage.Doptim.Doptimize(
201 | arrayclass, nrestarts=10, niter=niter, optimfunc=alpha, maxtime=runtime
202 | )
203 | print(f"OApackage generated {len(designs)} candidate designs.")
204 |
205 | # Select the best design found by OApackage based on DoEgen's criteria
206 | # (Uses evaluate_design2 to score them - see Ch 3)
207 | best_design_array = find_best_among_candidates(setup, designs) # Simplified
208 |
209 | # Evaluate the final selected design
210 | efficiencies = evaluate_design2(setup, best_design_array, dir_out=outpath_nrun)
211 |
212 | # Save the best design array and its efficiencies
213 | save_design_and_efficiencies(outpath_nrun, setup, runsize, best_design_array, efficiencies)
214 |
215 | return efficiencies # Return the scores for this runsize
216 | ```
217 |
218 | This simplified code shows how `DoEgen` acts as a manager: it sets up the problem, calls the specialized `OApackage` engine to do the heavy lifting of finding candidate designs, evaluates the results using its own criteria ([Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md)), and saves the final plan.
219 |
220 |
221 | ## Conclusion
222 |
223 | In this chapter, we explored **Design Generation**, the core process where `DoEgen` creates an efficient experimental plan. We learned that instead of testing every single combination (which is often impractical), `DoEgen` acts like a "smart scheduler", using the `OApackage` library to find a smaller set of experiments that are **balanced** and **near-orthogonal**.
224 |
225 | We saw how to initiate this process using the `python -m doegen.doegen` command with a settings file, and what kind of output files (the design arrays) to expect. We also got a glimpse into the internal steps `DoEgen` takes to optimize and evaluate these designs.
226 |
227 | Now that we have generated potential experimental plans, the next crucial step is to understand how "good" these plans actually are. Let's move on to [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md) to learn how `DoEgen` measures the quality of the generated designs.
228 |
229 |
230 | ---
231 |
232 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/03_design_evaluation___efficiency_metrics_.md:
--------------------------------------------------------------------------------
1 | # Chapter 3: Design Evaluation & Efficiency Metrics
2 |
3 | In [Chapter 2: Design Generation](02_design_generation_.md), we saw how `DoEgen` acts like a smart scheduler to create potential experimental plans (designs) for different numbers of runs. But how do we know if a generated plan is actually *good*? Just because we have a schedule doesn't mean it's efficient or useful. This chapter explains how `DoEgen` grades these plans using **Design Evaluation & Efficiency Metrics**.
4 |
5 | ## Why Grade Our Experiment Plan?
6 |
7 | Imagine you created several potential test drive schedules (from Chapter 2) for evaluating new cars. Some schedules might have you driving only on highways, others only in the city. Some might make you test the red car way more often than the blue car. Clearly, not all schedules are equally helpful!
8 |
9 | We need a way to **grade** these schedules based on specific criteria:
10 | * Does it test all the features (factors) fairly?
11 | * Does it cover different driving conditions (levels) evenly?
12 | * Does it avoid redundant tests or confusing situations where changing one feature always changes another?
13 |
14 | **Design Evaluation** in `DoEgen` does exactly this for our experimental plans. It assigns numerical scores (metrics) to quantify how well-designed a plan is. This helps us compare different plans (e.g., a 12-run plan vs. an 18-run plan) and choose the one that gives us the most reliable information for the number of experiments we can afford.
15 |
16 | ## Key Grading Criteria: The Efficiency Metrics
17 |
18 | `DoEgen` uses several statistical metrics to "grade" a design. Think of these like different subjects on a report card for your experimental plan. The scores usually range from 0 (worst) to 100 (best or theoretically optimal). Here are the main ones:
19 |
20 | 1. **Level Balance:**
21 | * **Question:** How evenly does the plan use each setting (level) for every factor?
22 | * **Analogy:** In our cake baking plan, does it use 170°C, 180°C, and 190°C roughly the same number of times? Does it test 'White' flour about as often as 'Whole Wheat'?
23 | * **Why it matters:** Ensures fair comparison of all levels. A low score means some settings are under-represented.
24 | * **Score:** 0-100. 100 means perfect balance (each level appears exactly the same number of times, or as close as possible).
25 |
26 | 2. **Orthogonality:**
27 | * **Question:** How independent are the factors in the plan? Can we change one factor's setting without being forced to change another?
28 | * **Analogy:** In the test drive schedule, if every time we test the 'Sport' engine, we *also* have to test the 'Manual' transmission, it's hard to tell if good performance is due to the engine or the transmission. An orthogonal plan avoids this forced pairing.
29 | * **Why it matters:** Allows us to estimate the effect of each factor separately without confusion. Low orthogonality (high correlation between factor columns in the plan) makes analysis difficult.
30 | * **Score:** 0-100. 100 means perfectly orthogonal (all factors are statistically independent in the design).
31 |
32 | 3. **D-Efficiency (D1-Eff, D-Eff, D2-Eff):**
33 | * **Question:** How precisely can we estimate the effects of the factors based on this plan?
34 | * **Analogy:** Think of this as the "sharpness" or "focus" of the picture we'll get about how each factor influences the outcome. A higher D-efficiency means a sharper picture.
35 | * **Why it matters:** A more D-efficient design allows for more accurate conclusions about which factors are important and by how much.
36 | * **Details:**
37 | * `D1-Eff`: Considers only the main effects of each factor (most commonly used for initial screening).
38 | * `D-Eff`: Considers main effects *and* quadratic effects (e.g., effect of Temperature and Temperature-squared).
39 | * `D2-Eff`: Considers main, quadratic, *and* two-way interaction effects (e.g., how Temperature and Sugar Amount interact).
40 | * **Score:** 0-100. 100 is a theoretical maximum. Higher is better, especially for `D1-Eff` in smaller designs.
41 |
42 | 4. **Interaction Balance (Two-way Interaction Balance):**
43 | * **Question:** How evenly does the plan test combinations of *pairs* of factor levels?
44 | * **Analogy:** Does our cake plan test 'White' flour combined with 170°C, 180°C, and 190°C? Does it also test 'Whole Wheat' flour with all three temperatures? Does it do this fairly for *all pairs* of factors?
45 | * **Why it matters:** Helps understand if the effect of one factor changes depending on the level of another factor (interactions).
46 | * **Score:** 0-100. 100 means all pairs of levels across all pairs of factors are tested equally often (or as close as possible).
47 | * **Related Metric:** *Two-way Interaction with at least one occurrence*: Checks if *every* possible pair combination appears at least once. Score 100 means yes, lower means some combinations are missing entirely.
48 |
49 | `DoEgen` also calculates other metrics like **Center Balance** (related to how well centered the design is for numeric factors) and **A-Efficiencies** (another measure related to estimation precision), giving a comprehensive evaluation.
50 |
51 | ## How `DoEgen` Performs the Evaluation
52 |
53 | The good news is: you don't usually need to run a separate command for evaluation! When you run the [Design Generation](02_design_generation_.md) process (`python -m doegen.doegen settings_design.yaml`), `DoEgen` automatically evaluates *every* design it generates for each run size.
54 |
55 | Inside `DoEgen`, a function called `evaluate_design2` (located in the `doegen/doegen.py` file) takes the generated design array (the table of 0s, 1s, 2s...) and calculates all these efficiency scores.
56 |
57 | ## Understanding the Evaluation Results
58 |
59 | After running the design generation, `DoEgen` saves the results of the evaluation in a few places within your output directory (specified in `settings_design.yaml`):
60 |
61 | 1. **Individual Efficiency Files:** For each run size (e.g., 12 runs), inside its specific subfolder (e.g., `DesignArray_Nrun12/`), you'll find:
62 | * `Efficiencies_[factor_levels]_Nrun12.csv`: A file containing the calculated scores (Level Balance, Orthogonality, D1-Eff, etc.) for the best 12-run design found.
63 | * `Table_Pearson_Correlation.csv`: Shows the pairwise correlation between factors (related to Orthogonality). Low values (near 0) are good.
64 | * `Table_Interaction_Balance.txt`: Details about the balance of pairwise combinations.
65 | * `pairwise_correlation.png`: A plot visualizing the relationships and balance within the design (see image below).
66 |
67 | 2. **Combined Efficiency File:** In the main output directory, you'll find:
68 | * `Efficiencies_[factor_levels]_all3.csv`: A summary table listing the key efficiency scores for *all* the run sizes tested (e.g., 12, 18, 24...). This is very useful for comparing designs.
69 |
70 | *Example Snippet (`Efficiencies_..._all3.csv`):*
71 | ```csv
72 | Center Balance,Level Balance,Orthogonality,Two-level Balance,Two-level Min-Eff,D-Eff,D1-Eff,D2-Eff,A-Eff,A1-Eff,A2-Eff,Nexp
73 | 96.5,97.2,91.3,88.5,95.0,15.1,75.3,8.2,10.5,60.1,5.1,12
74 | 98.1,98.5,94.6,92.1,100.0,18.9,85.7,10.1,12.8,72.5,6.8,18
75 | 99.2,99.0,97.8,96.3,100.0,25.6,92.1,14.5,18.3,81.0,9.9,24
76 | ... (more rows for other run sizes) ...
77 | ```
78 | *This table lets you see how scores improve (or plateau) as you increase the number of experiments (`Nexp`).*
79 |
80 | 3. **Efficiency Plot:** Also in the main output directory:
81 | * `Efficiencies_[factor_levels].png`: A plot showing how key efficiencies change with the number of runs.
82 |
83 | {width=400}
84 |
85 | * **How to read the plot:** The X-axis is the number of experiments (runs). The Y-axis is the efficiency score (0-100). Each colored line represents a different metric. You generally want designs where the lines for key metrics (like Level Balance, Orthogonality, Two-level Min-Eff, D1-Eff) are high (close to 100). This plot helps visualize the trade-off: often, scores improve rapidly at first and then level off. You might choose a run size where the scores are acceptably high but before the lines become flat (diminishing returns).
86 |
87 | 4. **Pairwise Correlation Plot:** For each individual design (e.g., in `DesignArray_Nrun12/`):
88 | * `pairwise_correlation.png`: Shows scatter plots for each pair of factors in the design.
89 |
90 | {width=600}
91 |
92 | * **How to read the plot:**
93 | * **Diagonal:** Histograms showing how often each level was used for that factor. Flat histograms indicate good Level Balance.
94 | * **Off-Diagonal:** Scatter plots showing the combinations tested for pairs of factors. If the points fill the space somewhat evenly and the regression line (blue line) is mostly flat (horizontal), it indicates good Orthogonality between those two factors. Steep lines indicate correlation (bad for orthogonality).
95 |
96 | ## What's Happening Inside `evaluate_design2`?
97 |
98 | Let's peek under the hood to see the basic steps the `evaluate_design2` function takes when it receives a design array:
99 |
100 | ```mermaid
101 | sequenceDiagram
102 | participant OD as optimize_design() (from Ch 2)
103 | participant ED2 as evaluate_design2() Function
104 | participant Array as Design Array (Input)
105 | participant Calcs as Internal Calculations (Numpy, Pandas)
106 | participant Files as Output Files (.csv, .png)
107 |
108 | OD->>ED2: Calls evaluate_design2() with a generated Array
109 | ED2->>Array: Receives the numerical design array (0s, 1s, ...)
110 | ED2->>Calcs: Calculates Level Balance (counts levels per column)
111 | ED2->>Calcs: Normalizes Array (scales values, e.g., to -1, 1)
112 | ED2->>Calcs: Calculates Orthogonality (via Correlation matrix of normalized array)
113 | ED2->>Calcs: Creates Model Matrix X (using `create_model`)
114 | ED2->>Calcs: Calculates D-Efficiencies (Determinant of X^T*X)
115 | ED2->>Calcs: Calculates A-Efficiencies (Trace of inverse of X^T*X)
116 | ED2->>Calcs: Calculates Interaction Balance (counts pairs of levels across pairs of columns)
117 | Calcs-->>ED2: Returns calculated scores
118 | ED2->>Files: Saves detailed tables (Correlation, Interaction Balance)
119 | ED2->>Files: Saves Pairwise Correlation Plot (using Matplotlib/Seaborn)
120 | ED2-->>OD: Returns main efficiency scores (e.g., a list or tuple)
121 | ```
122 |
123 | **Simplified Code View (`doegen/doegen.py`):**
124 |
125 | The `evaluate_design2` function uses libraries like `numpy` and `pandas` to perform these calculations. Here are highly simplified examples of the logic:
126 |
127 | ```python
128 | # Simplified view inside doegen/doegen.py - evaluate_design2 function
129 |
130 | import numpy as np
131 | import pandas as pd
132 | # ... other imports like itertools, matplotlib, seaborn ...
133 |
134 | def normalize_array(Array):
135 | """Scales array columns, e.g., from 0,1,2 to -1, 0, 1."""
136 | # Simplified: Actual code handles different ranges properly
137 | colmax = np.max(Array, axis=0)
138 | colmin = np.min(Array, axis=0)
139 | # Avoid division by zero if a factor has only one level tested
140 | coldelta = np.where(colmax > colmin, colmax - colmin, 1)
141 | colmean = (colmax + colmin) / 2.0
142 | return 2 * (Array - colmean) / coldelta
143 |
144 | def calc_Deff(X):
145 | """Calculates D-efficiency from model matrix X."""
146 | # D-eff relates to the determinant of the 'information matrix' (X^T * X)
147 | XX = np.dot(X.T, X)
148 | try:
149 | # Use slogdet for numerical stability
150 | _sign, logdet = np.linalg.slogdet(XX)
151 | # Geometric mean of eigenvalues, scaled
152 | det = np.exp(logdet / X.shape[1]) if _sign > 0 else 0
153 | except np.linalg.LinAlgError:
154 | det = 0 # Matrix might be singular (bad design)
155 | return 100 * det / X.shape[0] # Scaled score 0-100
156 |
157 | # def create_model(Array, mode=1): ... # Creates model matrix (see Ch 2)
158 |
159 | def evaluate_design2(setup, Array, printopt=False, dir_out=None, plotgrid=True):
160 | """Calculates various efficiency metrics for a design Array."""
161 | runsize, number_of_factors = Array.shape
162 | fac_levels = setup.factor_levels # e.g., [3, 3, 2, 3]
163 |
164 | # --- Level Balance Calculation (Simplified) ---
165 | sum_imbalance = 0.0
166 | for col_idx, nlevel in enumerate(fac_levels):
167 | column = Array[:, col_idx].astype(int)
168 | ideal_count_per_level = runsize / nlevel
169 | counts = np.bincount(column, minlength=nlevel) # Count occurrences of 0, 1, ...
170 | imbalance = np.sum(np.abs(counts - ideal_count_per_level))
171 | sum_imbalance += imbalance
172 | # Scale imbalance relative to total size, convert to % efficiency
173 | leveleff = 100 * (1 - sum_imbalance / (2 * (runsize - runsize / np.mean(fac_levels)))) # Formula nuance
174 |
175 | # --- Orthogonality (via Pearson Correlation) ---
176 | Anorm = normalize_array(Array) # Scale to -1 to 1 for fair correlation
177 | Acor_pearson = np.corrcoef(Anorm.T) # Calculate correlation matrix
178 | # Orthogonality relates to how close correlations are to zero
179 | # A simple (though not exact) proxy could involve off-diagonal sums
180 | ortho_measure = np.sum(np.abs(np.triu(Acor_pearson, k=1))) # Sum absolute off-diagonal correlations
181 | # Convert to 0-100 scale (lower correlation sum is better)
182 | orthoeff = 100 * (1 - ortho_measure / (number_of_factors * (number_of_factors - 1) / 2)) # Simplified scaling
183 |
184 | # --- D-Efficiency (Example for Main Effects D1-Eff) ---
185 | X1, _ = create_model(Anorm, mode=1, norm=False) # Model with main effects
186 | D1eff = calc_Deff(X1)
187 |
188 | # --- Interaction Balance (Conceptual) ---
189 | # Uses calc_twofactorbalance() internally
190 | # Involves iterating through pairs of columns (factors)
191 | # For each pair, count occurrences of level combinations (e.g., how often Factor A=0 and Factor B=1 occurs)
192 | # Compare counts to the ideal count (runsize / (levels_A * levels_B))
193 | # Sum up deviations to get imbalance score, convert to % efficiency
194 | twoleveleff, twolevelmin, _ = calc_twofactorbalance(setup, Array)
195 | twoleveleff, twolevelmin = 100 * twoleveleff, 100 * twolevelmin
196 |
197 | # ... calculate other metrics (Center Balance, A-Eff, D-Eff, D2-Eff) ...
198 |
199 | # --- Save outputs if dir_out is provided ---
200 | if dir_out is not None:
201 | # Save correlation tables, interaction balance tables, plots...
202 | # (Code uses pandas DataFrames .to_csv() and matplotlib/seaborn .savefig())
203 | pass
204 |
205 | # --- Return the main scores ---
206 | efficiencies = (
207 | # centereff, # Calculated earlier
208 | leveleff,
209 | orthoeff,
210 | twoleveleff,
211 | twolevelmin,
212 | # Deff, D1eff, D2eff, # Calculated
213 | # Aeff, A1eff, A2eff, # Calculated
214 | # ... other scores ...
215 | ) # Actual function returns a specific tuple of ~11 scores
216 | # Simplified return for clarity:
217 | return (leveleff, orthoeff, D1eff, twoleveleff, twolevelmin) # Example subset
218 | ```
219 |
220 | This code takes the design array, performs calculations (like counting, normalizing, computing correlations, building models, finding determinants), saves detailed diagnostics, and finally returns the key efficiency scores.
221 |
222 | ## Conclusion
223 |
224 | In this chapter, we learned about **Design Evaluation & Efficiency Metrics**. We saw that `DoEgen` doesn't just generate experiment plans; it also grades them using metrics like **Level Balance**, **Orthogonality**, **D-Efficiency**, and **Interaction Balance**. These scores, typically ranging from 0 to 100, tell us how "good" a design is in terms of fairness, independence, precision, and coverage of combinations.
225 |
226 | We learned that this evaluation happens automatically during the [Design Generation](02_design_generation_.md) step, producing `.csv` files and plots that summarize the efficiencies for different run sizes. Understanding these metrics and plots is crucial for making an informed decision about which experimental plan to actually use.
227 |
228 | Now that we know how to generate designs (Chapter 2) and how to evaluate their quality (Chapter 3), the next logical step is to use these evaluations to pick the best design for our specific needs and budget. Let's move on to [Chapter 4: Design Selection](04_design_selection_.md) to see how `DoEgen` helps us with this final step in planning our experiment.
229 |
230 | ---
231 |
232 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/04_design_selection_.md:
--------------------------------------------------------------------------------
1 | # Chapter 4: Design Selection
2 |
3 | In the previous chapter, [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md), we learned how `DoEgen` grades the experimental plans (designs) it creates using scores like Level Balance, Orthogonality, and D-Efficiency. We generated and evaluated designs for various numbers of experimental runs (e.g., 12 runs, 18 runs, 24 runs, etc.).
4 |
5 | But now we have a bunch of potential plans, each with its own report card (efficiency scores). How do we pick the *right* one to actually use for our experiment? This is where **Design Selection** comes in!
6 |
7 | ## The Challenge: Choosing the Best Plan for You
8 |
9 | Imagine you're planning a trip. A travel agent (like `DoEgen`'s design generation) might show you several itineraries:
10 | * A quick, bare-bones trip (fewest days, covers just the essentials).
11 | * A well-rounded trip (moderate length, good mix of sights and relaxation).
12 | * A long, comprehensive trip (many days, sees absolutely everything).
13 |
14 | You wouldn't just pick one randomly! You'd consider your budget, how much time you have, and what's most important to you.
15 |
16 | Similarly, after `DoEgen` generates and evaluates designs with different numbers of runs (12, 18, 24...), we need to choose the one that best fits our experimental "budget" (how many runs we can afford) and our "goals" (how much detail and reliability we need). Doing this manually by comparing all the efficiency scores for all the run sizes can be tedious.
17 |
18 | ## The Solution: Automatic Recommendations
19 |
20 | `DoEgen` makes this easier by automatically suggesting a few good candidate designs based on the efficiency scores it calculated in Chapter 3. Think of it like the **travel agent highlighting three recommended options**:
21 |
22 | 1. **Minimum Design:** This is like the **cheapest valid plan**. It's the design with the *fewest* experimental runs that still meets some basic quality standards (e.g., good enough balance and orthogonality, covers essential combinations). It's suitable if you're on a very tight budget or just doing an initial screening.
23 |
24 | 2. **Optimal Design:** This is like the **best value-for-money plan**. It aims to find a sweet spot, balancing high quality (good efficiency scores) with a reasonable number of runs. It often requires more runs than the minimum, but the improvement in quality is usually worth the extra effort.
25 |
26 | 3. **Best Design:** This is like the **most comprehensive plan**. It's the design that achieves the highest overall quality score among all the generated options, even if it means doing quite a few more experiments. This is for situations where getting the absolute highest quality data is the top priority, and the number of runs is less of a constraint.
27 |
28 | This automatic selection process helps you quickly narrow down the choices to a few sensible options.
29 |
30 | ## How `DoEgen` Selects the Designs
31 |
32 | Good news! Design selection happens **automatically** at the very end of the [Design Generation](02_design_generation_.md) process. When you run:
33 |
34 | ```bash
35 | python -m doegen.doegen settings_design.yaml
36 | ```
37 |
38 | After generating and evaluating designs for all requested run sizes (e.g., 12, 18, 24...), `DoEgen` performs one final step: it analyzes the collected efficiency data (specifically, the information stored in the `Efficiencies_[factor_levels]_all3.csv` file we saw in Chapter 3) and applies a set of rules to pick the Minimum, Optimal, and Best designs.
39 |
40 | ## The Selection Rules (Simplified View)
41 |
42 | `DoEgen` uses specific thresholds based on the efficiency metrics from Chapter 3. Here's a simplified idea of the rules (the exact percentages can be found in the `DoEgen` documentation, like `MANUAL.md` or `README.md`):
43 |
44 | 1. **Minimum Design Criteria:**
45 | * **Goal:** Find the *smallest* design that's basically sound.
46 | * **Rules:**
47 | * Must have enough runs (usually, number of runs `Nexp >= number of factors + 1`).
48 | * Must have good **Level Balance** (e.g., > 95%).
49 | * Must have good **Orthogonality** (e.g., > 90%).
50 | * Must test every pair of factor levels at least once (**Two-level Min-Eff** = 100%).
51 | * `DoEgen` looks through the evaluated designs, starting from the smallest run size, and picks the *first one* that meets all these conditions.
52 |
53 | 2. **Optimal Design Criteria:**
54 | * **Goal:** Find the best balance between quality and run count among designs meeting stricter criteria.
55 | * **Rules:**
56 | * Must meet even *higher* thresholds for **Level Balance** (e.g., > 98%) and **Orthogonality** (e.g., > 95%).
57 | * Must also meet the **Two-level Min-Eff** = 100% requirement.
58 | * Among the designs meeting these stricter rules, `DoEgen` often calculates a score that rewards high efficiency but adds a small penalty for increasing the number of runs significantly beyond the 'Minimum' design. It picks the design that maximizes this "value" score.
59 |
60 | 3. **Best Design Criteria:**
61 | * **Goal:** Find the design with the absolute highest overall quality score.
62 | * **Rules:**
63 | * `DoEgen` calculates an overall score for *all* generated designs. This score typically sums up key efficiencies (like Level Balance, Orthogonality, D1-Efficiency) and might include a small penalty based on the run size (to slightly favor smaller designs if scores are almost identical).
64 | * It simply picks the design with the highest calculated score, regardless of whether it's much larger than the Minimum or Optimal.
65 |
66 | ## Where to Find the Recommendations
67 |
68 | Once the `doegen.doegen` script finishes, you'll find the selection results in your specified output directory:
69 |
70 | 1. **Summary Text File:**
71 | * `Experiment_Design_selection_summary.txt`
72 | * This file gives you a clear, easy-to-read summary listing the chosen Minimum, Optimal, and Best designs, their run sizes (`Nexp`), and their key efficiency scores. This is usually the first place to look!
73 |
74 | *Example Snippet (`Experiment_Design_selection_summary.txt`):*
75 | ```text
76 | RESULTS OVERVIEW:
77 | --------------------------------
78 | Minimum Exp Design Runsize: 30
79 | Optimal Exp Design Runsize: 72
80 | Best Exp Design Runsize: 90
81 | --------------------------------
82 |
83 |
84 | Efficiencies:
85 | ------------------------------------------------------------------------------
86 | Minimum Design Optimal Design Best Design
87 | Center Balance 96.800 99.500 99.600
88 | Level Balance 97.500 99.100 99.200
89 | Orthogonality 92.300 98.200 98.500
90 | Two-Way Interact Bal 91.800 97.100 97.800
91 | D Efficieny 19.500 35.800 38.200
92 | D1 Efficieny 88.100 96.400 97.100
93 | ```
94 |
95 | 2. **Ready-to-Use Design Tables:**
96 | * `Designtable_minimum_NrunXX.csv` (e.g., `Designtable_minimum_Nrun30.csv`)
97 | * `Designtable_optimal_NrunYY.csv` (e.g., `Designtable_optimal_Nrun72.csv`)
98 | * `Designtable_best_NrunZZ.csv` (e.g., `Designtable_best_Nrun90.csv`)
99 | * These `.csv` files contain the actual experimental plans for the selected designs. Unlike the raw `EDarray...csv` files (which use numbers like 0, 1, 2), these tables show the real factor names and the actual level values (e.g., '180C', 'Catalyst X') you defined in your setup. They are ready for you to use to run your experiments!
100 |
101 | *Example Snippet (`Designtable_optimal_Nrun72.csv`):*
102 | ```csv
103 | Nexp,Temperature,Pressure,Catalyst,Speed
104 | 1,20,1,Catalyst X,100
105 | 2,30,5,Catalyst X,200
106 | 3,40,1,Catalyst X,300
107 | 4,20,5,Catalyst Y,300
108 | 5,30,1,Catalyst Y,100
109 | ... (72 rows total) ...
110 | ```
111 |
112 | 3. **Efficiency Plot:**
113 | * `Efficiencies_[factor_levels].png` (Generated in Chapter 3, but useful here)
114 | * Looking back at this plot helps you visually understand the trade-offs. You can see where adding more runs gives diminishing returns (the curves flatten out), which might help you decide if the 'Optimal' or 'Best' design is worth the extra runs compared to the 'Minimum'.
115 |
116 | ## What Happens Under the Hood?
117 |
118 | The selection logic happens within the `main` function of `doegen/doegen.py` *after* all designs have been generated and their efficiencies calculated and stored in a combined array (let's call it `effs_array`).
119 |
120 | **Process Flow:**
121 |
122 | ```mermaid
123 | sequenceDiagram
124 | participant DG as DoEgen Main Script (doegen.py)
125 | participant OptLoop as Optimization Loop (Ch 2 & 3)
126 | participant EffArray as Combined Efficiency Data (effs_array)
127 | participant Rules as Selection Logic
128 | participant Output as Output Files (.txt, .csv)
129 |
130 | DG->>OptLoop: Generate & Evaluate designs for N=12, 18, 24...
131 | OptLoop-->>EffArray: Store all efficiency scores
132 | Note right of EffArray: Contains scores for all run sizes
133 |
134 | DG->>EffArray: Access the combined efficiency data
135 | DG->>Rules: Apply 'Minimum' criteria thresholds
136 | Rules-->>DG: Identify runsize for Minimum design (e.g., 30)
137 | DG->>Rules: Apply 'Optimal' criteria thresholds & scoring
138 | Rules-->>DG: Identify runsize for Optimal design (e.g., 72)
139 | DG->>Rules: Apply 'Best' scoring logic
140 | Rules-->>DG: Identify runsize for Best design (e.g., 90)
141 |
142 | DG->>Output: Write Experiment_Design_selection_summary.txt
143 | DG->>Output: Call array2valuetable() to create Designtable_minimum_Nrun30.csv
144 | DG->>Output: Call array2valuetable() to create Designtable_optimal_Nrun72.csv
145 | DG->>Output: Call array2valuetable() to create Designtable_best_Nrun90.csv
146 |
147 | ```
148 |
149 | **Simplified Code View (`doegen/doegen.py` - near the end of `main` function):**
150 |
151 | ```python
152 | # Simplified view from doegen/doegen.py - main function logic (after optimization loop)
153 |
154 | # Assume effs_array is a NumPy array where rows are run sizes and columns are efficiencies
155 | # Assume xrun is a NumPy array with the corresponding run sizes (e.g., [12, 18, 24, ...])
156 |
157 | def main(...):
158 | # ... (Code from Chapter 2 & 3: read setup, loop through run sizes, optimize, evaluate) ...
159 | # multi_effs = optimize_design_multi(...) # Collects efficiencies
160 | # effs_array = ... # Convert multi_effs into the array
161 |
162 | # ... (Save combined efficiencies plot and CSV as shown in Chapter 3) ...
163 |
164 | ###### Identify minimum, optimal, and best runsize ######
165 | print("Finding minimum, optimal and best designs...")
166 | Result = namedtuple("Result", ["name", "runsize", "effs"])
167 | results = {} # Dictionary to store the selected designs
168 |
169 | # --- Find Minimum Design ---
170 | # Apply thresholds using np.where: find indices where conditions are met
171 | min_thresholds_met = np.where(
172 | (effs_array[:, 0] >= 95) # Col 0: Center Balance
173 | & (effs_array[:, 1] >= 95) # Col 1: Level Balance
174 | & (effs_array[:, 2] >= 90) # Col 2: Orthogonality
175 | & (effs_array[:, 4] == 100) # Col 4: Two-level Min-Eff
176 | # & (xrun >= setup.number_of_factors + 1) # Implicitly handled by nrun_min usually
177 | )[0] # Get the indices that satisfy the conditions
178 |
179 | if len(min_thresholds_met) > 0:
180 | idx_min = min_thresholds_met[0] # Pick the first index (lowest run size)
181 | results["min"] = Result("minimum", xrun[idx_min], effs_array[idx_min])
182 | print(f" Minimum design found: {results['min'].runsize} runs")
183 | else:
184 | print(" Warning: Could not find a design meeting minimum criteria.")
185 |
186 | # --- Find Optimal Design ---
187 | # Apply stricter thresholds
188 | opt_thresholds_met = np.where(
189 | (effs_array[:, 0] >= 98)
190 | & (effs_array[:, 1] >= 98)
191 | & (effs_array[:, 2] >= 95)
192 | # & (effs_array[:, 3] >= 95) # Col 3: Two-level Balance
193 | & (effs_array[:, 4] == 100)
194 | )[0]
195 |
196 | if len(opt_thresholds_met) > 0 and "min" in results:
197 | # Calculate a score for designs meeting optimal criteria
198 | # Score rewards efficiency, penalizes extra runs vs minimum
199 | runs_sel = xrun[opt_thresholds_met]
200 | score = (
201 | effs_array[opt_thresholds_met, 0] # Center Bal
202 | + effs_array[opt_thresholds_met, 2] # Ortho
203 | + effs_array[opt_thresholds_met, 3] # 2-Way Bal
204 | + 0.5 * effs_array[opt_thresholds_met, 6] # D1-Eff (Col 6)
205 | - (4.0 / results["min"].runsize) * runs_sel # Penalty for run size
206 | )
207 | idx_opt_relative = np.argmax(score) # Find index with max score *within the selection*
208 | idx_opt_absolute = opt_thresholds_met[idx_opt_relative] # Get original index
209 | results["opt"] = Result("optimal", xrun[idx_opt_absolute], effs_array[idx_opt_absolute])
210 | print(f" Optimal design found: {results['opt'].runsize} runs")
211 | else:
212 | print(" Warning: Could not find a design meeting optimal criteria.")
213 |
214 | # --- Find Best Design ---
215 | # Calculate score based on overall quality, slight penalty for size
216 | score_best = (
217 | effs_array[:, 0] # Center Bal
218 | + effs_array[:, 2] # Ortho
219 | + effs_array[:, 3] # 2-Way Bal
220 | + (100 * (effs_array[:, 4] - 100)) # Heavy penalty if MinEff != 100
221 | + 0.5 * effs_array[:, 6] # D1-Eff
222 | - (1.0 / nrun_max) * xrun # Small penalty for run size
223 | )
224 | idx_best = np.argmax(score_best) # Find index with highest score overall
225 | results["best"] = Result("best", xrun[idx_best], effs_array[idx_best])
226 | print(f" Best design found: {results['best'].runsize} runs")
227 |
228 |
229 | # --- Generate Output Files ---
230 | print("Saving minimum, optimal, and best design as experiment design tables...")
231 | # (Code to write the summary text file)
232 | print_designselection_summary(results, fname_out=os.path.join(outpath, "Experiment_Design_selection_summary.txt"))
233 |
234 | # Loop through selected results and create the final tables
235 | for result in results.values():
236 | # Construct paths to the raw array file and the output table file
237 | fname_array = os.path.join(outpath, f"DesignArray_Nrun{result.runsize}", f"EDarray_{setup.factor_levels}_Nrun{result.runsize}.csv")
238 | fname_out = os.path.join(outpath, f"Designtable_{result.name}_Nrun{result.runsize}.csv")
239 | # Call the function to convert the raw array to a user-friendly table
240 | array2valuetable(setup, fname_array, fname_out)
241 | # (Optional: Code to append non-varied factors if any)
242 |
243 | print("\nFINISHED Design Selection")
244 | ```
245 |
246 | This code snippet shows how `DoEgen` uses NumPy's array filtering (`np.where`) and calculations (`np.argmax`) to apply the selection rules and identify the indices corresponding to the Minimum, Optimal, and Best designs within the `effs_array`. Finally, it calls `array2valuetable` to create the human-readable `.csv` files for these selected designs.
247 |
248 | ## Remember: They are Suggestions!
249 |
250 | While the Minimum, Optimal, and Best suggestions are very helpful starting points, they are based on general rules. You are the expert on your experiment!
251 |
252 | * **Check the Summary and Plot:** Always look at the `Experiment_Design_selection_summary.txt` and the `Efficiencies...png` plot.
253 | * **Consider Your Constraints:** Maybe the 'Optimal' design suggests 72 runs, but your budget strictly limits you to 50. In that case, you might look at the efficiency plot and the `Efficiencies_..._all3.csv` file to find the best design available at or below 50 runs (perhaps the 48-run design).
254 | * **Manual Selection:** If you decide to use a design different from the suggested ones (e.g., you want the 48-run design), you can easily create its user-friendly table yourself. Find the corresponding raw array file (`EDarray_..._Nrun48.csv`) in its subfolder (`DesignArray_Nrun48/`) and use the `array2valuetable` function (or simply adapt the code snippet above) to convert it.
255 |
256 | ## Conclusion
257 |
258 | In this chapter, we learned about **Design Selection**, the helpful feature in `DoEgen` that automatically recommends candidate experimental plans after generation and evaluation. By suggesting a **Minimum** (cheapest valid), **Optimal** (best value), and **Best** (highest quality) design, it simplifies the process of choosing a final plan from the many options generated.
259 |
260 | We saw how this selection is based on predefined efficiency criteria applied to the results from [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md), and where to find the recommendations (`Experiment_Design_selection_summary.txt`) and the ready-to-use plans (`Designtable_....csv`).
261 |
262 | With a well-evaluated and selected experimental plan in hand, we are finally ready to perform the actual experiments! The next step is to gather the results from these experiments and bring them back into `DoEgen`.
263 |
264 | Let's move on to [Chapter 5: Experiment Result Input & Merging](05_experiment_result_input___merging_.md) to see how we manage the data coming back from our experimental runs.
265 |
266 | ---
267 |
268 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/05_experiment_result_input___merging_.md:
--------------------------------------------------------------------------------
1 | # Chapter 5: Experiment Result Input & Merging
2 |
3 | In [Chapter 4: Design Selection](04_design_selection_.md), we saw how `DoEgen` helps us choose the best experimental plan (like the `Designtable_optimal_Nrun72.csv` file) based on efficiency metrics. Now comes the exciting part: you've actually *run* those experiments! Maybe you baked the cakes, grew the plants, or ran the simulations according to the plan.
4 |
5 | Now you have the results – how tasty was each cake? How tall did each plant grow? What was the output of each simulation? This chapter is all about taking those real-world results and getting them ready to be analyzed by `DoEgen`.
6 |
7 | ## The Goal: Matching Results to the Plan
8 |
9 | Imagine you followed the test drive schedule from Chapter 4. For each drive (each row in your `Designtable_...csv`), you recorded the actual fuel efficiency (Miles Per Gallon or Liters per 100km).
10 |
11 | Now you have two pieces of information:
12 | 1. **The Plan:** Which car settings (engine, transmission, etc.) were used for each specific test drive (e.g., Drive #1, Drive #2...). This is in your `Designtable_...csv` file.
13 | 2. **The Results:** The actual fuel efficiency you measured for each test drive (e.g., Drive #1 got 25 MPG, Drive #2 got 30 MPG...). This might be scribbled in a notebook or typed somewhere else.
14 |
15 | The goal of **Experiment Result Input & Merging** is to **combine these two pieces of information systematically**. We need to match the result (e.g., 25 MPG) back to the exact conditions that produced it (Drive #1: specific engine, transmission, etc.).
16 |
17 | `DoEgen` needs this combined information to figure out how the different factors (engine, transmission) influenced the outcome (fuel efficiency).
18 |
19 | ## The Tool: The Experiment Results Excel Template
20 |
21 | Just like we used an Excel template to define our experiment setup in [Chapter 1: Experiment Setup Definition](01_experiment_setup_definition_.md), `DoEgen` uses another Excel template to collect your experimental results. This ensures the results are in a structured format that `DoEgen` can easily understand and merge with the original design.
23 |
24 | You can create a blank template file using a helper script included with `DoEgen`:
25 |
26 | ```bash
27 | # Run this command in your terminal in the DoEgen project directory
28 | python -m doegen.create_resultfile
29 | ```
30 |
31 | This command creates an Excel file named `Experiment_results_template.xlsx`.
32 |
33 | Let's look at the key columns in this template:
34 |
35 | *(Based on the image from `MANUAL.md`)*
36 | ![Experiment result template header](../../figures/Result_header.png){width=600}
37 |
38 | | Column Header | Description | Example Value | Why it's Important |
39 | | :------------ | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------- | :----------------------------------------------------- |
40 | | `Nexp` | **Experiment Run Number.** This *must* match the `Nexp` number from your `Designtable_....csv` file. | `1`, `2`, ... | **Crucial!** This is how results are linked to factors. |
41 | | `PID` | **Point ID (Optional).** Use this if you measure results at multiple locations or times *within* the same experiment run (e.g., different sensors, different time points). | `SensorA`, `1` | Allows for more detailed, repeated measures analysis. |
42 | | `Y Label` | **Result Name (Optional).** Use this if you measure multiple *different types* of results for each run (e.g., 'Taste Score' and 'Baking Time' for the cake). | `Taste`, `MPG` | Allows analysis of multiple outcome variables. |
43 | | `Y Exp` | **The Actual Measured Result.** This is the outcome value you observed for the specific run (`Nexp`), point (`PID`), and result type (`Y Label`). | `8.5`, `25.3` | **The core data!** This is what you measured. |
44 | | `Y Truth` | **True/Target Value (Optional).** If you know the 'correct' or expected value (e.g., in simulations or calibration), enter it here. | `9.0`, `26.0` | Used for calculating accuracy (like RMSE) in analysis. |
45 | | `Std Y Exp` | Standard Deviation of `Y Exp` (Optional). | `0.2` | For advanced analysis considering measurement noise. |
46 | | `Std Y Truth` | Standard Deviation of `Y Truth` (Optional). | `0.1` | For advanced analysis. |
47 | | `Weight PID` | Weight for this specific point (Optional). | `1.0` | For advanced weighted analysis. |
48 |
49 | **The most important columns for basic use are `Nexp` and `Y Exp`.**
50 |
51 | ## Filling Out the Results Template
52 |
53 | Let's say you used the `Designtable_optimal_Nrun72.csv` from Chapter 4 for your experiment. It had 72 runs (`Nexp` from 1 to 72). You measured one result, let's call it 'Yield'. You only measured it once per run (so `PID` and `Y Label` can be simple, like `1`).
54 |
55 | Here’s how you might start filling out `Experiment_results_template.xlsx`:
56 |
57 | | Nexp | PID | Y Label | Y Exp | Y Truth | Std Y Exp | Std Y Truth | Weight PID |
58 | | :--- | :-: | :------ | :---- | :------ | :-------- | :---------- | :--------- |
59 | | 1 | 1 | Yield | 85.2 | | | | |
60 | | 2 | 1 | Yield | 91.5 | | | | |
61 | | 3 | 1 | Yield | 88.0 | | | | |
62 | | ... | ... | ... | ... | ... | ... | ... | ... |
63 | | 72 | 1 | Yield | 93.1 | | | | |
64 |
65 | *You would fill in the actual `Y Exp` value you measured for each of the 72 runs.* If you had multiple PIDs or Y Labels, you would have more rows. For example, if run `Nexp=1` had `PID=SensorA` and `PID=SensorB`, you'd have two rows for `Nexp=1`.
66 |
67 | **Key Point:** The `Nexp` column in your results file is the bridge connecting your measured `Y Exp` back to the specific factor settings used in that run, which are listed in the corresponding `Nexp` row of your `Designtable_....csv` file.
68 |
69 | ## How `DoEgen` Merges the Data (Internal View)
70 |
71 | You don't usually run a separate command just for merging. The merging happens *inside* the next step: the result analysis module ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)). When you run the analysis script (`doegen.doeval`), one of the first things it does is read both your design table and your results file and combine them.
73 |
74 | **Step-by-step merging process:**
75 |
76 | 1. **Read Design:** The analysis module reads the selected design table (e.g., `Designtable_optimal_Nrun72.csv`) which contains the factor settings for each `Nexp`.
77 | 2. **Read Results:** It reads your filled-in results file (e.g., `Experiment_results_Nrun72.xlsx`).
78 | 3. **Match 'Nexp':** It uses the `Nexp` column as the key to link the two tables. For each row in the results file, it finds the row in the design table with the *same* `Nexp`.
79 | 4. **Combine:** It creates a new, combined table that includes both the factor settings (from the design table) and the measured results (from the results file) for each experiment run.
80 |
81 | **Sequence Diagram:**
82 |
83 | ```mermaid
84 | sequenceDiagram
85 | participant U as User
86 | participant DEA as DoEgen Analysis (doeval.py)
87 | participant MER as merge_expresults() Function
88 | participant PD as Pandas Library
89 | participant CDT as Combined Data Table
90 |
91 | U->>DEA: Runs analysis with paths to design & result files
92 | DEA->>MER: Calls merge_expresults(result_file, design_file)
93 | MER->>PD: Asks Pandas to read Design CSV file
94 | PD-->>MER: Returns Design DataFrame (Table)
95 | MER->>PD: Asks Pandas to read Results Excel file
96 | PD-->>MER: Returns Results DataFrame (Table)
97 | MER->>PD: Asks Pandas to merge the two tables ON 'Nexp'
98 | PD-->>MER: Returns the combined DataFrame
99 | MER-->>DEA: Returns the merged data
100 | DEA->>CDT: Stores the combined data for analysis
101 | Note right of CDT: Ready for Chapter 6!
102 | ```
103 |
104 | This diagram shows that the `doeval.py` script uses a helper function (`merge_expresults`), which in turn uses the powerful `pandas` library to read the files and perform the merge based on the `Nexp` column.
105 |
106 | ## Diving Deeper into the Code (Simplified View)
107 |
108 | The function responsible for this merging is typically `merge_expresults` inside the `doegen/doeval.py` script. Let's look at a simplified version:
109 |
110 | ```python
111 | # Simplified view from doegen/doeval.py - merge_expresults function
112 | import pandas as pd
113 |
114 | def merge_expresults(fname_result, fname_design, y_label=None):
115 | """
116 | Reads experiment results and merges with the design parameter file.
117 |
118 | Args:
119 | fname_result (str): Path to the experimental results file (Excel).
120 | fname_design (str): Path to the experimental design file (CSV or Excel).
121 | y_label (str, optional): Filter results for a specific Y Label. Defaults to None.
122 |
123 | Returns:
124 | pandas.DataFrame: A combined table with factors and results.
125 | """
126 | try:
127 | # Read the results file (Excel)
128 | print(f"Reading results file: {fname_result}")
129 | dfres = pd.read_excel(fname_result)
130 |
131 | # Read the design file (Can be CSV or Excel)
132 | print(f"Reading design file: {fname_design}")
133 | if fname_design.endswith('.csv'):
134 | dfdes = pd.read_csv(fname_design)
135 | else:
136 | dfdes = pd.read_excel(fname_design)
137 |
138 | # Optional: Filter results for a specific 'Y Label' if provided
139 | if y_label is not None:
140 | print(f"Filtering results for Y Label: {y_label}")
141 | dfres = dfres[dfres["Y Label"] == y_label]
142 |
143 | # --- The Core Merging Step ---
144 | # Use pandas merge function. It looks for common columns ('Nexp' here).
145 | # 'how="left"' means keep all rows from the results (left) table
146 | # and add matching data from the design (right) table.
147 | print(f"Merging results and design based on 'Nexp' column...")
148 | dfcomb = dfres.merge(dfdes, on="Nexp", how="left")
149 |
150 | print(f"Successfully merged data. Combined table has {dfcomb.shape[0]} rows and {dfcomb.shape[1]} columns.")
151 | return dfcomb
152 |
153 | except FileNotFoundError as e:
154 | print(f"Error: File not found - {e}")
155 | return None
156 | except Exception as e:
157 | print(f"Error during merging: {e}")
158 | return None
159 |
160 | # How it might be called inside doeval.py (simplified):
161 | # design_file = "output/Designtable_optimal_Nrun72.csv"
162 | # results_file = "data/Experiment_results_Nrun72.xlsx"
163 | # combined_data = merge_expresults(results_file, design_file)
164 | # if combined_data is not None:
165 | # # Proceed with analysis using combined_data... (Chapter 6)
166 | # pass
167 | ```
168 |
169 | This code snippet shows the key steps:
170 | 1. It uses `pandas.read_excel` and `pandas.read_csv` to load your data into tables (called DataFrames).
171 | 2. It optionally filters the results based on the `Y Label`.
172 | 3. The magic happens with `dfres.merge(dfdes, on="Nexp", how="left")`. This tells pandas: "Take the results table (`dfres`), find the matching `Nexp` row in the design table (`dfdes`), and combine the columns into a single new table."
173 | 4. It returns this combined table, ready for the statistical analysis in the next chapter.
174 |
175 | ## Conclusion
176 |
177 | In this chapter, we focused on the crucial step of preparing your experimental results for `DoEgen`. We learned about the **Experiment Results Excel template** and the importance of structuring your data, especially using the `Nexp` column to link results back to the specific experimental conditions (factors and levels) from your chosen design table.
178 |
179 | We saw that while you manually fill in the results template, the actual **merging** of results with the design plan happens automatically as the first step within the analysis module (`doeval.py`), using the `pandas` library.
180 |
181 | Now that we have a single, combined table containing both the experimental plan *and* the measured outcomes, we are perfectly set up to finally analyze the data and understand the impact of our factors.
182 |
183 | Let's proceed to [Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md) to learn how `DoEgen` helps us make sense of these combined results!
185 |
186 | ---
187 |
188 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/06_result_analysis___statistics_.md:
--------------------------------------------------------------------------------
1 | # Chapter 6: Result Analysis & Statistics
2 |
3 | Welcome back! In [Chapter 5: Experiment Result Input & Merging](05_experiment_result_input___merging_.md), we successfully combined our experimental plan (the factor settings) with the actual results we measured (`Y Exp` values). We now have a single, rich dataset ready for interrogation!
5 |
6 | But just having the data isn't enough. We need to make sense of it. Which factors actually *mattered*? Which settings led to the best outcomes? This chapter introduces **Result Analysis & Statistics**, the part of `DoEgen` that helps you answer these critical questions.
7 |
8 | ## What's the Big Idea? Finding the Story in Your Data
9 |
10 | Imagine you completed all the test drives from our car fuel economy example (Chapter 5). You diligently recorded the MPG for each car configuration you tested. Now, you have a spreadsheet full of numbers. What next?
11 |
12 | You'd want to analyze this data to find the story:
13 | * **Performance Check:** How good were the results overall? If you knew the "official" MPG ratings (`Y Truth`), how close were your measurements? (This is like calculating accuracy).
14 | * **Key Influencers:** Did changing the `Engine Type` have a huge impact on MPG? What about `Tire Pressure`? Or was `Paint Color` irrelevant? (This is finding factor importance).
15 | * **Top Performers:** Which specific combination of `Engine Type`, `Tire Pressure`, etc., gave the absolute best fuel economy in your tests? (This is identifying the best settings).
16 |
17 | **Result Analysis & Statistics** in `DoEgen` does precisely this kind of analysis automatically. It processes your combined data to extract meaningful insights, helping you understand *what* happened in your experiment and *why*.
18 |
19 | ## Key Analysis Tasks `DoEgen` Performs
20 |
21 | `DoEgen`'s analysis module (`doeval.py`) focuses on several key tasks:
22 |
23 | 1. **Calculating RMSE (if `Y Truth` is available):**
24 | * **What it is:** Root Mean Square Error (RMSE) measures the average difference between your experimental results (`Y Exp`) and the known true values (`Y Truth`).
25 | * **Analogy:** If the car manufacturer stated a car gets 30 MPG (`Y Truth`), and your test drive measured 28 MPG (`Y Exp`), the error is 2 MPG. RMSE calculates an overall "average error" across all your tests.
26 | * **Why it matters:** A lower RMSE indicates your experimental results were closer to the true values, suggesting higher accuracy or a better model fit (in simulations).
27 |
28 | 2. **Determining Factor "Importance":**
29 | * **What it is:** This identifies how much influence each factor has on the outcome (`Y Exp`). It calculates the *range* of the average outcome when changing a factor's levels.
30 | * **Analogy:** If the average MPG for 'Electric' engines was 50 and for 'Petrol' engines was 25, the range (importance) for `Engine Type` is 50 - 25 = 25 MPG (a big impact!). If the average MPG for 'Red' paint was 30.1 and 'Blue' paint was 30.0, the range for `Paint Color` is only 0.1 MPG (very low importance).
31 | * **Why it matters:** Helps you focus on the factors that actually drive the results and ignore the ones that don't make much difference.
32 |
33 | 3. **Identifying Top Performers:**
34 | * **What it is:** If you provided `Y Truth`, `DoEgen` ranks the experiments based on the lowest RMSE (most accurate runs). It lists the factor settings for these top-performing runs.
35 | * **Analogy:** Listing the top 5 car configurations from your test drives that had the smallest difference between your measured MPG and the official MPG rating.
36 | * **Why it matters:** Helps you pinpoint the specific settings that achieved the best (or most accurate) results in your experiment.
37 |
38 | 4. **Correlation Analysis:**
39 | * **What it is:** Examines the relationship between each factor and the outcome (Y). Does increasing Factor X tend to increase Y? Decrease Y? Or have no clear relationship?
40 | * **Analogy:** Does higher `Tire Pressure` generally lead to higher `MPG`?
41 | * **Why it matters:** Gives insights into the direction and strength of the relationship between inputs and outputs.
42 |
43 | ## How to Run the Analysis
44 |
45 | Running the result analysis is straightforward. You'll need:
46 |
47 | 1. Your **Design Table** file (e.g., `Designtable_optimal_Nrun72.csv` from [Chapter 4: Design Selection](04_design_selection_.md)).
49 | 2. Your filled-in **Experiment Results** file (e.g., `Experiment_results_Nrun72.xlsx` from [Chapter 5: Experiment Result Input & Merging](05_experiment_result_input___merging_.md)).
51 | 3. A **Settings file** for the analysis (usually `settings_expresults.yaml`). This file tells `DoEgen` where to find your input files and where to save the analysis outputs. You can create default templates using `python -m doegen.init_config` if needed.
52 |
53 | Once these are ready, you run the `doeval` module from your terminal:
54 |
55 | ```bash
56 | # Make sure your design and result files are ready
57 | # Make sure your settings file (e.g., settings_expresults.yaml) points to them
58 |
59 | # Run the result evaluation module
60 | python -m doegen.doeval settings_expresults.yaml
61 | ```
62 |
63 | **What does this command do?**
64 | * It tells Python to run the `doeval` module within the `doegen` package.
65 | * It passes the `settings_expresults.yaml` file, which contains the necessary file paths and analysis options.
66 |
67 | **What happens next?**
68 | `DoEgen` will:
69 | 1. Read the settings file.
70 | 2. Read your design table and results file.
71 | 3. **Merge** the two tables based on the `Nexp` column (as discussed in Chapter 5).
72 | 4. Perform the statistical analyses (RMSE, Importance, Correlations, Top Performers) on the merged data.
73 | 5. Save the results as tables (`.csv` files) and plots (`.png` files) in the output directory specified in your settings file.
74 |
75 | ## Understanding the Analysis Outputs
76 |
77 | After running `doeval`, look inside your specified output folder (e.g., `test/expresults/` in the example). You'll find several helpful files for each 'Y Label' you analyzed:
78 |
79 | * **Factor Importance:**
80 | * `Experiment_[Y_Label]_Factorimportance.csv`: A table listing each factor and its calculated importance (Yrange), along with the min, max, mean, and standard deviation of the average Y value across its levels.
81 | * `Ybarplot_[Y_Label].png`: A bar chart visually showing the importance (range) of each factor. Factors with longer bars have a bigger impact.
82 | {width=600}
83 |
84 | * **RMSE and Top Performers (if `Y Truth` was provided):**
85 | * `Experiment_[Y_Label]_RMSE.csv`: The combined data table with an added 'RMSE' column showing the calculated error for each experiment run.
86 | * `Experiment_[Y_Label]_RMSE_TopN_sorted.csv`: A table showing the factor settings for the top N experiments that had the *lowest* RMSE (i.e., the most accurate runs).
87 | {width=600}
88 | * `BestFactor_Avg[Y_Label].png`: A bar chart showing the average factor settings for the top N experiments, weighted by their RMSE. This gives an idea of the optimal settings based on accuracy.
89 | {width=600}
90 |
91 | * **Correlation Plots:**
92 | * `Expresult_correlation_X-Y_[Y_Label].png`: Shows scatter plots of the outcome (Y Exp Mean) versus each *numeric* factor, with a regression line showing the trend. Helps visualize linear relationships.
93 | {width=600}
94 | * `Y-pairwise-correlation_[Y_Label].png`: A "corner plot" showing heatmaps for every pair of factors. The color indicates the average outcome (Y Exp Mean) for that combination of factor levels. Useful for seeing interactions and how combinations affect the result.
95 | {width=600}
96 | * (Similar plots for RMSE vs. factors are also generated if `Y Truth` is available).
97 |
98 | These outputs provide a comprehensive overview of your experiment's results.
99 |
100 | ## What's Happening Under the Hood?
101 |
102 | Let's look at the main steps `doeval.py` takes when you run the analysis command:
103 |
104 | 1. **Read Settings:** Loads the paths and options from your `settings_expresults.yaml` file.
105 | 2. **Merge Data:** Calls the `merge_expresults` function (from `doegen/doeval.py`, discussed in Chapter 5) to read the design (`.csv`) and results (`.xlsx`) files and combine them into a single pandas DataFrame based on the `Nexp` column.
106 | 3. **Calculate Statistics (`calc_expresults_stats`):** This is the core analysis function (in `doegen/doeval.py`). It iterates through each `Y Label` (if you have multiple outcomes):
107 | * Calculates average `Y Exp` and `Y Truth` (if available) for each `Nexp` (handling multiple PIDs if present).
108 | * **Factor Importance:** For each factor, it groups the data by the factor's levels, calculates the average `Y Exp` for each level, and finds the range (max avg - min avg).
109 | * **RMSE:** If `Y Truth` is present, calculates the RMSE for each `Nexp`.
110 | * **Top Performers:** Sorts the results by RMSE and identifies the top N runs. Calculates weighted averages of factor settings for these top runs.
111 | * Saves the calculated statistics to `.csv` files.
112 | 4. **Generate Plots:** Calls various plotting functions (like `plot_3dmap`, `plot_regression`, also in `doegen/doeval.py`) using the calculated statistics and the merged data to create the `.png` visualizations.
113 |
114 | **Sequence Diagram:**
115 |
116 | ```mermaid
117 | sequenceDiagram
118 | participant U as User
119 | participant DEV as doeval.py (Main Script)
120 | participant MER as merge_expresults()
121 | participant CALC as calc_expresults_stats()
122 | participant PLOT as Plotting Functions
123 | participant Files as Output Files (.csv, .png)
124 |
125 | U->>DEV: Runs `python -m doegen.doeval settings.yaml`
126 | DEV->>DEV: Reads settings_expresults.yaml
127 | DEV->>MER: Calls merge_expresults(results_file, design_file)
128 | MER-->>DEV: Returns combined DataFrame (Merged Data)
129 | DEV->>CALC: Calls calc_expresults_stats(Merged Data)
130 | Note right of CALC: Calculates Importance, RMSE, Top N...
131 | CALC-->>DEV: Returns statistics / Modifies DataFrame
132 | DEV->>Files: Saves statistics tables (.csv)
133 | DEV->>PLOT: Calls plotting functions (plot_3dmap, plot_regression...)
134 | PLOT->>Files: Saves plots (.png)
135 | DEV-->>U: Prints "FINISHED" message
136 | ```
137 |
138 | ## Diving Deeper into the Code (Simplified View)
139 |
140 | The main logic resides in `doegen/doeval.py`.
141 |
142 | 1. **Main Execution (`main` function):** Orchestrates the process.
143 |
144 | ```python
145 | # Simplified view from doegen/doeval.py - main function
146 |
147 | import pandas as pd
148 | import yaml
149 | from pathlib import Path # For handling file paths
150 |
151 | # Import helper functions from the same file
152 | from .doeval import merge_expresults, calc_expresults_stats, plot_3dmap, plot_regression # ... other plotting functions
153 |
154 | def main(inpath, fname_results, fname_design, outpath=None):
155 | # --- Setup Paths ---
156 | inpath = Path(inpath)
157 | if outpath is None:
158 | outpath = inpath
159 | else:
160 | outpath = Path(outpath)
161 | outpath.mkdir(parents=True, exist_ok=True) # Create output folder if needed
162 |
163 | # --- 1. Read Design and Results ---
164 | # (Uses pandas internally as shown in Chapter 5)
165 | print("Reading and merging design and result files...")
166 | dfcomb = merge_expresults(inpath / fname_results, inpath / fname_design)
167 | if dfcomb is None:
168 | print("Error during file reading/merging. Exiting.")
169 | return # Stop if merging failed
170 |
171 | # Get design table separately for stats calculation logic
172 | if str(fname_design).endswith('.csv'):
173 | dfdes = pd.read_csv(inpath / fname_design)
174 | else:
175 | dfdes = pd.read_excel(inpath / fname_design)
176 | # Filter out constant factors if any
177 | dfdes = dfdes[dfdes.columns[dfdes.nunique() > 1]].copy()
178 | params = list(dfdes)[1:] # Get factor names
179 |
180 | # Get the unique result types (Y Labels)
181 | try:
182 | ylabels = dfcomb["Y Label"].unique()
183 | except KeyError: # Handle case where 'Y Label' column might be missing
184 | print("Warning: 'Y Label' column not found. Assuming a single result type 'Y1'.")
185 | dfcomb["Y Label"] = 'Y1'
186 | ylabels = dfcomb["Y Label"].unique()
187 |
188 | # --- 2. Calculate Statistics ---
189 | print("Calculating statistics (Importance, RMSE, Top Performers)...")
190 | # Pass the original results DataFrame (dfcomb) and design DataFrame (dfdes)
191 | calc_expresults_stats(ylabels, dfdes, dfcomb, outpath)
192 | # This function saves its own CSV outputs internally
193 |
194 | # --- 3. Generate Plots ---
195 | print("Generating plots...")
196 | for ylabel in ylabels:
197 | print(f" Plotting for Y Label: {ylabel}")
198 | # Reload the RMSE results saved by calc_expresults_stats
199 | try:
200 | df_results_for_ylabel = pd.read_csv(outpath / f"Experiment_{ylabel}_RMSE.csv")
201 | except FileNotFoundError:
202 | print(f"Warning: RMSE file for {ylabel} not found, skipping some plots.")
203 | continue # Skip to next ylabel if file doesn't exist
204 |
205 | # Call plotting functions
206 | plot_3dmap(df_results_for_ylabel, params, "Y Exp Mean",
207 | outpath / f"Y-pairwise-correlation_{ylabel}.png")
208 | plot_regression(df_results_for_ylabel, params, 'Y Exp Mean',
209 | outpath / f"Expresult_correlation_X-Y_{ylabel}.png")
210 |
211 | # Plot RMSE-related plots only if RMSE column exists
212 | if 'RMSE' in df_results_for_ylabel.columns:
213 | plot_3dmap(df_results_for_ylabel, params, "RMSE",
214 | outpath / f"RMSE-pairwise-correlation_{ylabel}.png")
215 | # plot_factordis(df_results_for_ylabel, params, 'RMSE', # Example of another plot
216 | # outpath / f"Expresult_distribution_X-RMSE_{ylabel}.png")
217 |
218 | print("FINISHED Result Analysis")
219 |
220 | # The script uses argparse to read the settings file path from the command line
221 | # and then calls main(**cfg) where cfg is the dictionary loaded from YAML
222 | ```
223 |
224 | 2. **Core Statistics (`calc_expresults_stats`):** This function does the heavy lifting.
225 |
226 | ```python
227 | # Simplified logic inside doegen/doeval.py - calc_expresults_stats function
228 |
229 | import numpy as np
230 | import pandas as pd
231 | import matplotlib.pyplot as plt # Used for the importance bar plot
232 |
233 | def calc_expresults_stats(ylabels, dfdes, dfres, outpath):
234 | params = list(dfdes)[1:] # Factor names
235 | npar = len(params)
236 | nexp = dfdes.shape[0]
237 |
238 | for ylabel in ylabels:
239 | print(f"-- Analyzing Y Label: {ylabel} --")
240 | # --- Prepare Data for this Y Label ---
241 | ydf = dfres[dfres["Y Label"] == ylabel].copy() # Filter results for this Y
242 | # Calculate mean Y values per experiment run (Nexp)
243 | ymean = ydf.groupby("Nexp")["Y Exp"].mean()
244 | ystd = ydf.groupby("Nexp")["Y Exp"].std()
245 | # Add these means to a copy of the design DataFrame
246 | dfdes_y = dfdes.copy()
247 | dfdes_y["Y Exp Mean"] = ymean
248 | dfdes_y["Y Exp Std"] = ystd
249 |
250 | # --- Calculate Factor Importance ---
251 | factor_importance = []
252 | for i, param in enumerate(params):
253 | levels = dfdes_y[param].unique()
254 | avg_y_per_level = []
255 | for level in levels:
256 | # Get average Y for runs where factor 'param' was at 'level'
257 | avg_y = dfdes_y.loc[dfdes_y[param] == level, "Y Exp Mean"].mean()
258 | avg_y_per_level.append(avg_y)
259 | # Importance = Range of average Y values across levels
260 | yrange = np.nanmax(avg_y_per_level) - np.nanmin(avg_y_per_level)
261 | factor_importance.append({
262 | 'Factor': param, 'Yrange': yrange,
263 | 'Ymin': np.nanmin(avg_y_per_level), 'Ymax': np.nanmax(avg_y_per_level),
264 | 'Ymean': np.nanmean(avg_y_per_level), 'Ystd': np.nanstd(avg_y_per_level)
265 | })
266 | # Save importance results
267 | df_importance = pd.DataFrame(factor_importance).set_index('Factor')
268 | df_importance.to_csv(outpath / f"Experiment_{ylabel}_Factorimportance.csv")
269 | # Plot importance bar chart (simplified call)
270 | df_importance.sort_values('Yrange')['Yrange'].plot(kind='barh', title=f'Importance (Range) {ylabel}')
271 | plt.tight_layout()
272 | plt.savefig(outpath / f"Ybarplot_{ylabel}.png")
273 | plt.close()
274 |
275 | # --- Calculate RMSE (if Y Truth exists) ---
276 | if "Y Truth" in ydf.columns and ydf["Y Truth"].notnull().any():
277 | print(" Calculating RMSE...")
278 | ytruemean = ydf.groupby("Nexp")["Y Truth"].mean()
279 | dfdes_y["Y Truth Mean"] = ytruemean
280 | # Calculate squared error for each Nexp
281 | sq_error = (dfdes_y["Y Exp Mean"] - dfdes_y["Y Truth Mean"])**2
282 | # Need to handle potential multiple PIDs per Nexp correctly for RMSE
283 | # (Actual code might need more careful averaging of squared errors before sqrt)
284 | # Simplified: Assume one value per Nexp for RMSE calculation here
285 | rmse = np.sqrt(sq_error) # Simplified: Should average before sqrt if multiple PIDs
286 | dfdes_y["RMSE"] = rmse
287 | # Save combined table with RMSE
288 | dfdes_y.to_csv(outpath / f"Experiment_{ylabel}_RMSE.csv", index=False)
289 |
290 | # --- Identify Top Performers ---
291 | print(" Identifying Top Performers by RMSE...")
292 | nsel = min(10, max(3, nexp // 5)) # Select top ~20%, between 3 and 10
293 | dfsort = dfdes_y.sort_values("RMSE").head(nsel)
294 | dfsort.to_csv(outpath / f"Experiment_{ylabel}_RMSE_Top{nsel}_sorted.csv", index=False)
295 |
296 | # (Actual code also calculates weighted average parameters for top performers
297 | # and plots them using helper functions like weighted_avg_and_std and plot_table)
298 | else:
299 | print(" 'Y Truth' not found or empty, skipping RMSE calculations.")
300 | # Save the table without RMSE if Y Truth was missing
301 | dfdes_y.to_csv(outpath / f"Experiment_{ylabel}_results_summary.csv", index=False)
302 | ```
303 |
304 | These snippets illustrate how `DoEgen` loads data, iterates through factors and outcomes, performs calculations using `pandas` and `numpy`, and generates outputs.
305 |
306 | ## Conclusion
307 |
308 | In this chapter, we dove into **Result Analysis & Statistics**. We learned how `DoEgen` takes the merged experiment plan and results data (from Chapter 5) and processes it to uncover valuable insights.
309 |
310 | We saw how to run the `doeval.py` script and what key analyses it performs: calculating accuracy (RMSE if `Y Truth` is available), determining which factors had the biggest impact (Factor Importance), identifying the best-performing settings, and examining correlations. We also explored the various `.csv` tables and `.png` plots generated, which provide a comprehensive summary of your experiment's findings.
311 |
312 | With this analysis complete, you have a much clearer picture of how your factors influence your outcomes. The plots generated provide powerful visual summaries.
313 |
314 | Now, let's take a closer look at these visualizations in the next chapter. We'll explore how to interpret the different plots `DoEgen` creates in more detail in [Chapter 7: Result Visualization](07_result_visualization_.md).
315 |
316 |
317 | ---
318 |
319 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/07_result_visualization_.md:
--------------------------------------------------------------------------------
1 | # Chapter 7: Result Visualization
2 |
3 | In the previous chapter, [Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md), we learned how `DoEgen` analyzes the combined experiment plan and results data. It calculated important statistics like factor importance and RMSE, and identified top-performing experiments. This gave us valuable tables full of numbers.
4 |
5 | But sometimes, just looking at tables of numbers can be overwhelming. It's often much easier to understand patterns and trends by looking at pictures! This is where **Result Visualization** comes in.
6 |
7 | ## Why Pictures? Making Sense of Results Visually
8 |
9 | Imagine you just finished your car test drives (from our ongoing example). You have a spreadsheet showing the MPG for every combination of engine, tires, and driving style you tested. You also have the analysis results from Chapter 6, telling you which factors were most "important".
10 |
11 | Now, how do you easily *see* these findings?
12 | * How much *more* important was `Engine Type` compared to `Tire Pressure`? A bar chart makes this comparison instant.
13 | * Does higher `Tire Pressure` *always* lead to better MPG, or does it level off? A scatter plot with a trend line reveals the relationship.
14 | * Does the best `Engine Type` change if you also use `Sporty Tires`? A heatmap showing combinations helps spot these interactions.
15 |
16 | **Result Visualization** in `DoEgen` automatically creates these kinds of **charts and graphs** from your analysis results. It turns the numbers and tables from Chapter 6 into visual summaries, making it much faster and easier to grasp the key takeaways from your experiment.
17 |
18 | ## The Main Visual Tools `DoEgen` Provides
19 |
20 | `DoEgen` generates several types of plots automatically when you run the analysis step ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)). Let's look at the most common ones:
21 |
22 | 1. **Factor Importance Bar Chart (`Ybarplot_[Y_Label].png`)**
23 | * **What it shows:** Compares how much influence each factor had on the outcome (`Y Exp`). Longer bars mean the factor caused a bigger change in the results when its levels were varied.
24 | * **Why it's useful:** Quickly identifies the most impactful factors ("big hitters") and the least impactful ones. Helps you focus on what really matters.
25 |
26 | 2. **Factor vs. Outcome Correlation Plot (`Expresult_correlation_X-Y_[Y_Label].png`)**
27 | * **What it shows:** For each *numeric* factor, it plots the factor's value against the average outcome (`Y Exp Mean`). It also draws a line showing the general trend (linear regression).
28 | * **Why it's useful:** Helps understand the *direction* and *linearity* of the relationship. Does increasing the factor generally increase or decrease the outcome? Is the relationship roughly a straight line?
29 |
30 | 3. **Pairwise Factor Heatmap (`Y-pairwise-correlation_[Y_Label].png`)**
31 | * **What it shows:** A grid ("corner plot") where each square shows the interaction between *two* factors. The color in the square represents the average outcome (`Y Exp Mean`) when those two factors were set to specific levels.
32 | * **Why it's useful:** Excellent for spotting *interactions*. Does the effect of Factor A change depending on the level of Factor B? You can see which combinations lead to high or low outcomes. It works for both numeric and categorical factors.
33 |
34 | 4. **Top Performers Average Settings (`BestFactor_Avg[Y_Label].png`)** *(Only if `Y Truth` was provided)*
35 | * **What it shows:** If you calculated RMSE (accuracy), this plot shows the average settings of the factors for the most accurate (lowest RMSE) experiments.
36 | * **Why it's useful:** Gives a visual clue about the combination of settings that led to the most accurate results in your specific experiment.
37 |
38 | ## How are the Plots Generated?
39 |
40 | You don't need to run a separate command! These plots are created automatically as part of the [Result Analysis & Statistics](06_result_analysis___statistics_.md) process. When you run:
41 |
42 | ```bash
43 | python -m doegen.doeval settings_expresults.yaml
44 | ```
45 |
46 | After calculating the statistics (like importance and correlations), `DoEgen` uses plotting libraries (`matplotlib` and `seaborn`) to generate these `.png` image files and saves them in your output directory (specified in `settings_expresults.yaml`), alongside the `.csv` tables from Chapter 6.
47 |
48 | ## Reading the Visual Story: Interpreting the Plots
49 |
50 | Let's learn how to read the main plots using the examples from the `DoEgen` documentation.
51 |
52 | **1. Factor Importance Bar Chart**
53 |
54 | {width=600}
55 |
56 | * **How to read it:**
57 | * The **Y-axis** lists your factors (parameters).
58 | * The **X-axis** shows the "Importance" or "Range" of the outcome (Y). This is the difference between the highest and the lowest average Y value observed across that factor's levels.
59 | * **Longer bars** mean the factor had a bigger impact on the outcome during your experiment. Shorter bars mean it had less impact.
60 | * **Example interpretation:** In this plot, `Parameter6` has the longest bar, meaning changing its levels caused the largest variation in the measured outcome (Y). `Parameter1` and `Parameter8` had relatively small impacts. This instantly tells you where to focus your attention if you want to control the outcome Y.
61 |
62 | **2. Factor vs. Outcome Correlation Plot**
63 |
64 | {width=600}
65 |
66 | * **How to read it:**
67 | * This shows a grid of smaller plots, one for each *numeric* factor.
68 | * In each small plot:
69 | * The **X-axis** is the value of the factor.
70 | * The **Y-axis** is the average outcome (`Y Exp Mean`).
71 | * The **dots** represent the average outcome measured at different levels of that factor.
72 | * The **blue line** shows the best straight-line fit through the dots (linear regression).
73 | * The **shaded blue area** shows the uncertainty in that fit.
74 | * The **`r` value** (top left) is the correlation coefficient. `r` close to 1 means strong positive linear correlation (line goes up), `r` close to -1 means strong negative linear correlation (line goes down), `r` close to 0 means weak or no linear correlation.
75 | * **Example interpretation:**
76 | * For `Parameter1`, the line is almost flat and `r` is close to 0, suggesting little linear relationship with Y.
77 | * For `Parameter3`, the line goes downwards and `r` is negative (around -0.5), suggesting that increasing `Parameter3` tends to decrease Y.
78 | * For `Parameter7`, the line goes upwards steeply and `r` is strongly positive (around 0.9), indicating a strong positive linear relationship between `Parameter7` and Y.
79 |
80 | **3. Pairwise Factor Heatmap (Corner Plot)**
81 |
82 | {width=600}
83 |
84 | * **How to read it:**
85 | * This is a grid showing interactions between pairs of factors.
86 | * Look at a specific square, for example, the one in the second row, first column. The **X-axis** corresponds to `Parameter1`, and the **Y-axis** corresponds to `Parameter2`.
87 | * The **colors** inside the square represent the average outcome (`Y Exp Mean`) observed for different combinations of `Parameter1` and `Parameter2` levels.
88 | * The **colorbar** on the right tells you what outcome value each color corresponds to (e.g., blue might be low Y, red might be high Y).
89 | * **Example interpretation:** By looking at the colors in the grid for `Parameter1` vs `Parameter2`, you can see if certain combinations lead to particularly high (red) or low (blue) values of Y. If the color pattern changes drastically across the square, it suggests an *interaction* - the effect of `Parameter1` on Y depends on the level of `Parameter2`. If you see similar plots for RMSE (`RMSE-pairwise-correlation...`), you can see which combinations led to more or less accurate results.
90 |
91 | ## What's Happening Under the Hood? (Simplified)
92 |
93 | Generating these plots involves using the results of the statistical analysis from Chapter 6.
94 |
95 | **Simplified Flow:**
96 |
97 | ```mermaid
98 | sequenceDiagram
99 | participant DEV as doeval.py (Main Script)
100 | participant CALC as calc_expresults_stats() (from Ch 6)
101 | participant PLT as Plotting Libraries (Matplotlib, Seaborn)
102 | participant PlotFuncs as Plotting Functions (plot_3dmap, plot_regression, etc.)
103 | participant Files as Output .png Files
104 |
105 | DEV->>CALC: Calculates statistics (Importance, Means, RMSE...)
106 | CALC-->>DEV: Returns stats / Saves CSVs
107 |
108 | DEV->>PlotFuncs: Calls plot_importance_bar(importance_data)
109 | PlotFuncs->>PLT: Uses Matplotlib to draw bars
110 | PLT-->>Files: Saves Ybarplot_[...].png
111 |
112 | DEV->>PlotFuncs: Calls plot_regression(merged_data)
113 | PlotFuncs->>PLT: Uses Seaborn regplot for each factor
114 | PLT-->>Files: Saves Expresult_correlation_X-Y_[...].png
115 |
116 | DEV->>PlotFuncs: Calls plot_3dmap(merged_data, 'Y Exp Mean')
117 | PlotFuncs->>PLT: Uses Pandas pivot_table & Seaborn heatmap
118 | PLT-->>Files: Saves Y-pairwise-correlation_[...].png
119 |
120 | Note right of DEV: Similar calls for RMSE plots if applicable.
121 | ```
122 |
123 | The `doeval.py` script first calculates the necessary data using `calc_expresults_stats`. Then, it calls specific plotting functions (like `plot_3dmap`, `plot_regression`) defined within `doeval.py`. These functions take the calculated data (often stored in pandas DataFrames) and use commands from the `matplotlib` and `seaborn` libraries to draw the actual plots and save them as `.png` files.
124 |
125 | **Simplified Code Snippets (`doegen/doeval.py`):**
126 |
127 | Let's look at tiny snippets to get the idea.
128 |
129 | 1. **Factor Importance Bar Plot (inside `calc_expresults_stats`)**
130 |
131 | ```python
132 | # Simplified view from doegen/doeval.py - inside calc_expresults_stats
133 |
134 | import matplotlib.pyplot as plt
135 | import pandas as pd
136 | import numpy as np
137 |
138 | def calc_expresults_stats(ylabels, dfdes, dfres, outpath):
139 | params = list(dfdes)[1:] # Factor names
140 | npar = len(params)
141 |
142 | for ylabel in ylabels:
143 | # ... (calculate factor importance 'width' as shown in Ch 6) ...
144 | # df_importance = pd.DataFrame(...) # Contains 'Yrange' for each factor
145 |
146 | # --- Plotting Part ---
147 | plt.ioff() # Turn off interactive display
148 | plt.figure(figsize=(8, 5))
149 | # Sort factors by importance (range)
150 | df_sorted = df_importance.sort_values('Yrange')
151 | # Create horizontal bar plot
152 | plt.barh(
153 | df_sorted.index, # Factor names on Y-axis
154 | width=df_sorted['Yrange'], # Bar length based on importance
155 | # left=ymin_par[sort], # Optional: show min value start
156 | color="red",
157 | )
158 | plt.title(f"Factor Importance (Range) for {ylabel}")
159 | plt.xlabel("Change in Y (Range)")
160 | plt.tight_layout() # Adjust spacing
161 | # Save the plot
162 | plt.savefig(outpath / f"Ybarplot_{ylabel}.png", dpi=300)
163 | plt.close() # Close the plot figure
164 | # ... (rest of the stats calculation) ...
165 | ```
166 | * This code uses `matplotlib.pyplot` (imported as `plt`) to create a horizontal bar chart (`plt.barh`) using the calculated importance range (`Yrange`). It saves the figure using `plt.savefig`.
167 |
168 | 2. **Factor vs. Outcome Correlation Plot (`plot_regression`)**
169 |
170 | ```python
171 | # Simplified view from doegen/doeval.py - plot_regression function
172 |
173 | import matplotlib.pyplot as plt
174 | import seaborn as sns
175 | import pandas as pd
176 | import numpy as np
177 |
178 | def plot_regression(df, params, target_name, fname_out):
179 | """Creates Correlation plot."""
180 | # Select only numeric columns from the factor list
181 | numeric_params = df[params].select_dtypes(include=np.number).columns
182 | nfac = len(numeric_params)
183 | # Determine grid layout for subplots
184 | nax1 = int(np.sqrt(nfac))
185 | nax2 = int(np.ceil(nfac / nax1))
186 |
187 | plt.ioff()
188 | fig = plt.figure(figsize=(nax1 * 4, nax2 * 3)) # Adjust figure size
189 | fig.suptitle(f"Factor vs. {target_name} Correlation", y=1.02) # Add title
190 |
191 | for i, param_name in enumerate(numeric_params):
192 | # Create a subplot in the grid
193 | ax = fig.add_subplot(nax2, nax1, i + 1)
194 | # Use seaborn's regplot to create scatter + regression line
195 | sns.regplot(x=param_name, y=target_name, data=df, ax=ax,
196 | scatter_kws={'s': 10}, # Smaller points
197 | line_kws={'lw': 1}) # Thinner line
198 | # Calculate correlation coefficient
199 | r = df[param_name].corr(df[target_name])
200 | # Add correlation value text to the plot
201 | ax.annotate(f"r = {r:.2f}", xy=(0.05, 0.9), xycoords='axes fraction')
202 | ax.set_title(param_name) # Set title for subplot
203 |
204 | plt.tight_layout(rect=[0, 0, 1, 0.98]) # Adjust layout to prevent title overlap
205 | plt.savefig(fname_out, dpi=300)
206 | plt.close()
207 | ```
208 | * This function uses the `seaborn` library (`sns.regplot`), which is built on top of `matplotlib`. The function loops over the numeric factors and, for each one, `regplot` draws the scatter plot and fits/draws the regression line, placing each factor in its own subplot of the grid.
209 |
210 | 3. **Pairwise Heatmap (`plot_3dmap`)**
211 |
212 | ```python
213 | # Simplified view from doegen/doeval.py - plot_3dmap function
214 |
215 | import matplotlib.pyplot as plt
216 | import seaborn as sns
217 | import pandas as pd
218 | import numpy as np
219 |
220 | def plot_3dmap(df, params, target_name, fname_out):
221 | """Plots pairwise heatmap (corner plot)."""
222 | nfac = len(params)
223 | # Find overall min/max of the target for consistent color scaling
224 | vmin = df[target_name].min()
225 | vmax = df[target_name].max()
226 |
227 | plt.ioff()
228 | fig, axs = plt.subplots(nfac - 1, nfac - 1, figsize=(nfac * 1.5, nfac * 1.5)) # Smaller figsize
229 | fig.suptitle(f"Pairwise Factor Heatmap for {target_name}", y=1.02)
230 |
231 | for i in range(nfac - 1): # Index for columns (X-axis factor)
232 | for j in range(i + 1, nfac): # Index for rows (Y-axis factor)
233 | ax = axs[j - 1, i] # Select the correct subplot
234 | try:
235 | # Create a pivot table: average target value for each combination
236 | pivot_data = pd.pivot_table(
237 | df, values=target_name, index=[params[j]], columns=[params[i]],
238 | aggfunc=np.nanmean # Use mean as the aggregation
239 | )
240 | # Draw the heatmap using seaborn
241 | sns.heatmap(
242 | pivot_data, cmap="viridis", # Use a different colormap
243 | annot=False, ax=ax, # No annotations for cleaner look
244 | vmin=vmin, vmax=vmax, # Consistent color scale
245 | square=True, cbar=False # Make squares, no individual color bars
246 | )
247 | except Exception as e:
248 | # Handle cases where pivot might fail (e.g., insufficient data)
249 | print(f"Could not create heatmap for {params[i]} vs {params[j]}: {e}")
250 | sns.heatmap(pd.DataFrame(), ax=ax, cbar=False) # Draw empty plot
251 |
252 | # Clean up axes labels for inner plots
253 | ax.set_xlabel(params[i] if j == nfac -1 else "")
254 | ax.set_ylabel(params[j] if i == 0 else "")
255 | if i > 0: ax.set_yticklabels([])
256 | if j < nfac - 1: ax.set_xticklabels([])
257 |
258 | # Hide unused upper triangle plots
259 | for i in range(nfac - 1):
260 | for j in range(i):
261 | axs[j, i].set_visible(False)
262 |
263 | # Add a single color bar for the whole plot
264 | fig.colorbar(axs[1, 0].collections[0], ax=axs[:, -1], location='right', shrink=0.6)
265 |
266 | plt.tight_layout(rect=[0, 0, 0.9, 0.98]) # Adjust layout
267 | plt.savefig(fname_out, dpi=300)
268 | plt.close()
269 |
270 | ```
271 | * This function iterates through pairs of factors. For each pair, it uses `pandas.pivot_table` to aggregate the `target_name` (e.g., `Y Exp Mean`) for each combination of levels. Then, it uses `seaborn.heatmap` to draw the colored grid representing these average values.
272 |
273 | ## Conclusion
274 |
275 | In this chapter, we explored **Result Visualization**. We saw that `DoEgen` automatically generates various plots like bar charts, correlation plots, and heatmaps during the result analysis phase ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)).
276 |
277 | These visualizations are incredibly helpful because they turn complex tables of numbers into easy-to-understand pictures. We learned how to interpret the key plots to quickly grasp factor importance, understand relationships between factors and outcomes, and identify potential interactions. These visual insights are crucial for communicating your experimental findings effectively.
278 |
279 | We've now walked through the main workflow of `DoEgen`, from defining the experiment to visualizing the results. Throughout this process, we've mentioned using configuration files (like `settings_design.yaml` and `settings_expresults.yaml`) to control `DoEgen`'s behavior. Let's wrap up by looking more closely at how these files work in the final chapter.
280 |
281 | Next up: [Chapter 8: Configuration Handling](08_configuration_handling_.md).
282 |
283 | ---
284 |
285 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/08_configuration_handling_.md:
--------------------------------------------------------------------------------
1 | # Chapter 8: Configuration Handling
2 |
3 | In [Chapter 7: Result Visualization](07_result_visualization_.md), we saw how `DoEgen` turns our analysis results into easy-to-understand plots. Throughout the previous chapters, we've hinted at using "settings files" (like `settings_design.yaml`) to tell `DoEgen` things like where our input files are located or where to save the output. This final chapter dives into how this **Configuration Handling** works.
4 |
5 | ## Why Do We Need Settings Files?
6 |
7 | Imagine you have a TV. You wouldn't want to open up the back and rewire it every time you wanted to change the channel, adjust the volume, or switch the input source! Instead, you use a **remote control** or a **settings menu**.
8 |
9 | **Configuration Handling** in `DoEgen` is exactly like that settings menu or remote control. It provides a way to tell `DoEgen` *how* to run without changing the core Python code itself. This is super important because:
10 |
11 | * **Flexibility:** You might want to run `DoEgen` on different experiments with different input files (`Experiment_setup.xlsx`) located in different folders.
12 | * **Customization:** You might want to change how long `DoEgen` spends searching for the best design (`maxtime_per_run`) or the maximum number of experiments (`nrun_max`) it should consider.
13 | * **Reusability:** You can easily reuse the same `DoEgen` code for various projects just by changing the settings file.
14 |
15 | Instead of hard-coding file paths or run limits directly into the Python scripts (which would be like rewiring the TV!), `DoEgen` reads these settings from simple text files when it starts up.
16 |
17 | ## What is Configuration Handling in `DoEgen`?
18 |
19 | Configuration Handling is the system `DoEgen` uses to manage all its operational settings. Think of it as the **central control panel** for the tool. It controls things like:
20 |
21 | * **Input Files:** Where is the `Experiment_setup.xlsx` file? Where is the `Experiment_results.xlsx` file?
22 | * **Output Location:** Where should `DoEgen` save the generated designs, efficiency reports, analysis results, and plots?
23 | * **Design Generation Parameters:** What's the minimum (`nrun_min`) and maximum (`nrun_max`) number of experiment runs to try generating? How big should the steps (`delta_nrun`) be between run sizes? How much time (`maxtime_per_run`) should be spent optimizing each run size?
24 | * **Analysis Options:** (Though less configurable in the current version, future versions might add options here).
25 |
26 | ## The Tool: Simple YAML Files
27 |
28 | `DoEgen` uses a simple, human-readable file format called **YAML** (often pronounced "yam-ul") for its configuration files. YAML files typically end with the `.yaml` or `.yml` extension.
29 |
30 | YAML is designed to be easy for both humans to read and write, and for computers to parse. It uses indentation (spaces) and key-value pairs separated by a colon (`:`).
31 |
32 | Here's a tiny example of what YAML looks like:
33 |
34 | ```yaml
35 | # This is a comment
36 | project_name: My Cake Baking Experiment
37 | output_directory: /path/to/my/results/
38 | max_runs: 50
39 | use_feature_x: true
40 | ```
41 |
42 | This is much easier to understand than complex code!
43 |
44 | ## `DoEgen`'s Settings Files
45 |
46 | `DoEgen` typically uses two main settings files:
47 |
48 | 1. **`settings_design.yaml`:** Used when running the design generation step ([Chapter 2: Design Generation](02_design_generation_.md)).
49 | 2. **`settings_expresults.yaml`:** Used when running the result analysis step ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)).
50 |
51 | You can create default template files using the command: `python -m doegen.init_config`
52 |
53 | Let's look at simplified examples of what you might find in these files:
54 |
55 | **Example: `settings_design.yaml` (Simplified)**
56 |
57 | ```yaml
58 | # Settings for Design Generation (doegen.doegen)
59 |
60 | # --- Input ---
61 | # Path to the folder containing the setup file
62 | path: 'test/input/'
63 | # Name of the Excel setup file
64 | fname_setup: 'Experiment_setup.xlsx'
65 |
66 | # --- Output ---
67 | # Path where all output folders and files will be saved
68 | outpath: 'test/output/'
69 |
70 | # --- Design Generation Parameters ---
71 | # Maximum number of runs to generate a design for
72 | nrun_max: 150
73 | # Minimum number of runs to start generating from (can be None to let DoEgen calculate)
74 | nrun_min: None
75 | # Step size between run numbers (e.g., generate for 12, 18, 24... if delta=6)
76 | delta_nrun: 6
77 | # Maximum time (seconds) to spend optimizing for EACH run size
78 | maxtime_per_run: 100
79 | ```
80 |
81 | * **Explanation:** This tells `DoEgen` where to find the `Experiment_setup.xlsx` file, where to save everything (`test/output/`), and the parameters controlling the search for designs (generate designs from a calculated minimum up to 150 runs, in steps of 6, spending up to 100 seconds on each size).
82 |
83 | **Example: `settings_expresults.yaml` (Simplified)**
84 |
85 | ```yaml
86 | # Settings for Result Analysis (doegen.doeval)
87 |
88 | # --- Input ---
89 | # Path containing the results and the specific design table used
90 | inpath: 'test/output/'
91 | # Name of the Excel results file you filled in
92 | fname_results: 'Experiment_results_Nrun72.xlsx'
93 | # Name of the specific design table CSV file you actually used for experiments
94 | fname_design: 'Designtable_optimal_Nrun72.csv'
95 |
96 | # --- Output ---
97 | # Path where analysis results (tables, plots) will be saved
98 | outpath: 'test/expresults/'
99 | ```
100 |
101 | * **Explanation:** This tells the analysis script (`doeval`) where to find the results file (`Experiment_results_Nrun72.xlsx`) and the corresponding design table (`Designtable_optimal_Nrun72.csv`). It also specifies that the analysis output should go into the `test/expresults/` folder.
102 |
103 | ## How `DoEgen` Uses the Settings
104 |
105 | When you run a `DoEgen` command, you tell it which settings file to use:
106 |
107 | ```bash
108 | # Running design generation
109 | python -m doegen.doegen settings_design.yaml
110 |
111 | # Running result analysis
112 | python -m doegen.doeval settings_expresults.yaml
113 | ```
114 |
115 | The script (`doegen.py` or `doeval.py`) starts by:
116 | 1. Looking at the command-line argument (`settings_design.yaml` or `settings_expresults.yaml`).
117 | 2. Opening and reading that specific YAML file.
118 | 3. Loading the settings (like `outpath`, `nrun_max`, etc.) into memory.
119 | 4. Using these loaded values throughout its execution to know where files are, how many runs to generate, etc.
120 |
121 | ## Creating and Editing Settings Files
122 |
123 | * **Templates:** `DoEgen` provides template YAML files (you can generate them with `python -m doegen.init_config` or find them in the installation).
124 | * **Editing:** You just need a simple text editor (like Notepad, VS Code, Sublime Text, etc.) to open the `.yaml` file and change the values after the colons (`:`) to match your file locations and desired parameters.
125 | * **Indentation:** Be careful with indentation (spaces at the beginning of lines) if you add more complex structures, as YAML uses it to understand the file structure. For simple key-value pairs like in the examples, indentation is usually not an issue.
126 |
127 | ## How It Works Under the Hood
128 |
129 | `DoEgen` uses standard Python libraries to handle configuration files.
130 |
131 | **High-Level Steps:**
132 |
133 | 1. You run the script, providing the path to your `.yaml` settings file.
134 | 2. The script uses Python's `argparse` library to get the settings file path from the command line.
135 | 3. It uses the `PyYAML` library (or `yaml` for short) to open and read the YAML file.
136 | 4. The `yaml` library parses the text, understanding the `key: value` pairs and the structure.
137 | 5. It converts the YAML content into a Python data structure (usually a dictionary).
138 | 6. The main `DoEgen` script can then easily access the settings by looking up keys in this dictionary (e.g., get the value associated with the key `outpath`).
139 |
140 | **Sequence Diagram:**
141 |
142 | ```mermaid
143 | sequenceDiagram
144 | participant U as User
145 | participant Script as DoEgen Script (e.g., doegen.py)
146 | participant ArgParse as Argument Parser
147 | participant YAMLlib as PyYAML Library
148 | participant Settings as Settings Dictionary
149 |
150 | U->>Script: Runs `python -m doegen.doegen settings_design.yaml`
151 | Script->>ArgParse: Gets settings file path ('settings_design.yaml')
152 | ArgParse-->>Script: Returns the path
153 | Script->>YAMLlib: Asks PyYAML to load the file at the path
154 | YAMLlib->>YAMLlib: Opens and parses 'settings_design.yaml'
155 | YAMLlib-->>Script: Returns settings as a Python dictionary
156 | Script->>Settings: Stores the loaded settings
157 | Script->>Settings: Accesses values (e.g., Settings['outpath']) during execution
158 | ```
159 |
160 | **Code Snippet (Simplified from `configloader.py`):**
161 |
162 | This is how `DoEgen` might load the settings file at the very beginning.
163 |
164 | ```python
165 | # Simplified from doegen/configloader.py or doegen/doegen.py main_cli
166 |
167 | import argparse # Library to handle command-line arguments
168 | import yaml # Library to read YAML files
169 |
170 | def load_settings(default_path='settings_design.yaml'):
171 | """Loads settings from a YAML file specified on the command line."""
172 |
173 | # 1. Set up to read the file path from the command line
174 | parser = argparse.ArgumentParser()
175 | # Allow the user to specify a settings file path, or use the default
176 | parser.add_argument('settings_path', nargs='?', default=default_path)
177 | args = parser.parse_args() # Get the arguments provided by the user
178 |
179 | settings_file_path = args.settings_path
180 | print(f"Using settings from: {settings_file_path}")
181 |
182 | try:
183 | # 2. Open and read the YAML file
184 | with open(settings_file_path) as f:
185 | # Use yaml.safe_load to parse the file safely
186 | cfg = yaml.safe_load(f)
187 |
188 | # 3. Return the loaded settings (as a dictionary)
189 | print("Settings loaded successfully.")
190 | return cfg
191 |
192 | except FileNotFoundError:
193 | print(f"Error: Settings file not found at {settings_file_path}")
194 | return None
195 | except Exception as e:
196 | print(f"Error loading settings file: {e}")
197 | return None
198 |
199 | # --- How the main script might use this ---
200 | # settings = load_settings()
201 | # if settings:
202 | # # Now access values like:
203 | # output_folder = settings['outpath']
204 | # max_runs = settings['nrun_max']
205 | # # ... use these values in the rest of the script ...
206 | # else:
207 | # print("Could not load settings. Exiting.")
208 | ```
209 |
210 | * **Explanation:** This code snippet first uses `argparse` to figure out which settings file the user wants to load (getting the path from the command line). Then, it opens that file and uses `yaml.safe_load()` to read the content and convert it into a Python dictionary called `cfg`. The main script can then use this `cfg` dictionary to get the values for `outpath`, `nrun_max`, etc.
211 |
212 | ## Benefits of Using Configuration Files
213 |
214 | * **Easy Customization:** Change file paths, run limits, etc., without touching the code.
215 | * **Separation of Concerns:** Keeps the "what to do" (code logic) separate from the "how to do it specifically this time" (settings).
216 | * **Reproducibility:** Save your settings file along with your results to remember exactly how an analysis was run.
217 | * **Sharing:** Share settings files easily with collaborators.
218 |
219 | ## Conclusion
220 |
221 | This chapter explored **Configuration Handling** in `DoEgen`. We learned that `DoEgen` uses simple, human-readable **YAML files** (like `settings_design.yaml` and `settings_expresults.yaml`) as its "settings panel" or "remote control". These files allow you to easily specify input/output paths, run limits, and other parameters without modifying the core Python code.
222 |
223 | We saw examples of these files, how `DoEgen` reads them using the `PyYAML` library, and why this approach is beneficial for flexibility and reproducibility.
224 |
225 | This concludes the main tutorial chapters for `DoEgen`! We've journeyed from defining an experiment ([Chapter 1: Experiment Setup Definition](01_experiment_setup_definition_.md)), generating and evaluating designs ([Chapter 2: Design Generation](02_design_generation_.md), [Chapter 3: Design Evaluation & Efficiency Metrics](03_design_evaluation___efficiency_metrics_.md)), selecting the best plan ([Chapter 4: Design Selection](04_design_selection_.md)), inputting results ([Chapter 5: Experiment Result Input & Merging](05_experiment_result_input___merging_.md)), analyzing the outcomes ([Chapter 6: Result Analysis & Statistics](06_result_analysis___statistics_.md)), visualizing the findings ([Chapter 7: Result Visualization](07_result_visualization_.md)), and finally, understanding how to configure the tool's operation ([Chapter 8: Configuration Handling](08_configuration_handling_.md)).
226 |
227 | We hope this tutorial provides a solid foundation for using `DoEgen` to design and analyze your own experiments effectively. Happy experimenting!
228 |
229 | ---
230 |
231 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
--------------------------------------------------------------------------------
/docs/DoEgen_explained/index.md:
--------------------------------------------------------------------------------
1 | # Tutorial: DoEgen
2 |
3 | `DoEgen` is a Python tool designed to help researchers with **Design of Experiments (DoE)**.
4 | It *automates* the creation of efficient experimental plans (*design generation*), helps evaluate how *good* these plans are (*design evaluation*), and suggests the best plans to use (*design selection*).
5 | After running the experiments, `DoEgen` can take the results, combine them with the plan, and perform *statistical analysis* and create *visualizations* to understand which factors are most important and what settings yield the best outcomes.
6 | It uses simple Excel templates for defining experiments and inputting results, and YAML files for configuration.
7 |
8 |
9 | **Source Repository:** [https://github.com/sebhaan/DoEgen](https://github.com/sebhaan/DoEgen)
10 |
11 | ```mermaid
12 | flowchart TD
13 | A0["Experiment Setup Definition
14 | "]
15 | A1["Design Generation
16 | "]
17 | A2["Design Evaluation & Efficiency Metrics
18 | "]
19 | A3["Design Selection
20 | "]
21 | A4["Experiment Result Input & Merging
22 | "]
23 | A5["Result Analysis & Statistics
24 | "]
25 | A6["Result Visualization
26 | "]
27 | A7["Configuration Handling
28 | "]
29 | A0 -- "Defines inputs for" --> A1
30 | A1 -- "Provides design for" --> A2
31 | A2 -- "Provides metrics for" --> A3
32 | A1 -- "Provides design for" --> A4
33 | A4 -- "Provides merged data for" --> A5
34 | A5 -- "Provides data for" --> A6
35 | A7 -- "Configures" --> A1
36 | A7 -- "Configures" --> A5
37 | ```
38 |
39 | ## Chapters
40 |
41 | 1. [Experiment Setup Definition
42 | ](01_experiment_setup_definition_.md)
43 | 2. [Design Generation
44 | ](02_design_generation_.md)
45 | 3. [Design Evaluation & Efficiency Metrics
46 | ](03_design_evaluation___efficiency_metrics_.md)
47 | 4. [Design Selection
48 | ](04_design_selection_.md)
49 | 5. [Experiment Result Input & Merging
50 | ](05_experiment_result_input___merging_.md)
51 | 6. [Result Analysis & Statistics
52 | ](06_result_analysis___statistics_.md)
53 | 7. [Result Visualization
54 | ](07_result_visualization_.md)
55 | 8. [Configuration Handling
56 | ](08_configuration_handling_.md)
57 |
58 |
59 | ---
60 |
61 | Generated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)
62 |
--------------------------------------------------------------------------------
/docs/MANUAL.md:
--------------------------------------------------------------------------------
1 | # DoEgen: A Python Library for Optimised Design of Experiment Generation and Evaluation
2 |
3 | DoEgen is a Python library aiming to assist in generating optimised Design of Experiments (DoE), evaluating design efficiencies, and analysing experiment results.
4 |
5 | In a first step, optimised designs can be automatically generated and efficiencies evaluated for any mixture of factor-levels for numeric and categorical factors. Designs are automatically evaluated as function of number of experiment runs and the most efficient designs are suggested. In particular DoEgen provides computation of a wide range of design efficiencies and allows to import and evaluate externally generated designs as well.
6 |
7 | The second part of DoEgen assists in analysing any derived experiment results in terms of factor importance, correlations, and response analysis for best parameter space selection.
8 |
9 | Written by Sebastian Haan (Sydney Informatics Hub, The University of Sydney)
10 |
11 |
12 | ## Table of Contents
13 | - [Definitions](#definitions)
14 | - [Functionality](#functionality)
15 | - [Installation And Requirements](#installation-and-requirements)
16 | - [Requirements](#requirements)
17 | - [User Templates](#user-templates)
18 | - [Running tests](#running-tests)
19 | - [Documentation](#documentation)
20 | - [Main Modules and Usage](#main-modules-and-usage)
21 | - [Design Generation](#design-generation)
22 | - [Design Efficiencies](#design-efficiencies)
23 | - [Design Selection](#design-selection)
24 | - [Experiment Result Analysis](#experiment-result-analysis)
25 | - [Use Case Study](#use-case-study)
26 | - [Comparison to Other DoE Tools](#comparison-to-other-doe-tools)
27 | - [Literature](#literature)
28 | - [Attribution and Acknowledgments](#attribution-and-acknowledgements)
29 | - [License](#license)
30 |
31 | ## Definitions
32 |
33 | An Experiment Design is typically defined by:
34 |
35 | * Number of Factors: the parameters or variates of the experiment
36 | * Number of Runs: the number of experiments
37 | * Levels: The number of value options for each factor, which can be either numeric values (discrete or continuous) or categorical. Discrete levels for continuous factors can be obtained by providing the minimum and maximum of the factor range and the number of levels. The more levels, the more "fine-grained" the experiment will evaluate this factor, but also more experimental runs are required.
38 |
39 | The goal of optimising an experimental design is to provide an efficient design that is near-optimal in terms of, e.g., orthogonality, level balance, and two-way interaction coverage, yet can be performed with a minimum number of experimental runs, which are often costly or time-consuming.
40 |
41 |
42 | ## Functionality
43 |
44 | If you would like to jumpstart a new experiment and to skip the technical details, you can find a summary of the main usage of DoEgen in [Case Study Use Case].
45 |
46 | Currently, the (preliminary) release contains several functions for generating and evaluating designs. Importing and evaluating external designs is supported (e.g. for comparison to other DoE generator tools). DoEgen also implements several functions for experiment result analysis and visualisation of parameter space.
47 |
48 | The main functionalities are (sorted in order of typical experiment process):
49 |
50 | * Reading Experiment Setup Table and Settings (Parameter Name, Levels for each factor, Maximum number of runs, Min/Max etc)
51 | * Generating optimised design arrays for a range of runs (given maximum number of runs, and optional computation-time constraints, see `settings_design.yaml`).
52 | * Evaluation and visualisation of more than ten design efficiencies such as level balance, orthogonality, D-efficiencies etc (see [Design Efficiencies] for the complete list).
53 | * Automatic suggestion of minimum, optimal, and best designs within a given range of experiment runs.
54 | * Import and evaluation of externally generated design arrays.
55 | * Experiment result analysis: Template table for experiment results, multi-variant RMSE computation, best model/parameter selection, Factor Importance computation, pairwise response surface and correlation computation, factor correlation analysis and Two-way interaction response plots.
56 | * Visualisation of experiment results.
57 |
58 |
59 | ## Installation And Requirements
60 |
61 |
62 | ### Requirements
63 |
64 | - Python >= 3.6
65 | - SWIG >=3.0.12
66 | - OApackage
67 | - xlrd
68 | - XlsxWriter
69 | - Numpy
70 | - Pandas
71 | - PyYAML
72 | - scikit-learn
73 | - matplotlib
74 | - seaborn
75 |
76 | The DoEgen package is currently considered experimental and has been tested with the libraries specified in `requirements.txt`.
77 |
78 | The OApackage requires an installation of SWIG, which can be found at https://www.dev2qa.com/how-to-install-swig-on-macos-linux-and-windows/ or can be installed via conda
79 |
80 | ```sh
81 | conda install swig
82 | ```
83 |
84 | After installing `swig` and `numpy`, DoEgen can be installed with
85 |
86 | ``` sh
87 | python setup.py build
88 | python setup.py install
89 | ```
90 |
91 | Note that OAPackage can be also installed manually by following installation instructions and documentation for OApackage (tested with
92 | OApackage 2.6.6), which can be found at https://pypi.org/project/OApackage/.
93 |
94 |
95 | ### User Templates
96 |
97 | 1) The factor (parameter) settings of the experiment are defined in an experiment setup table (see `Experiment_setup_template.xlsx`). A new excel setup template table can also be created with `create_setupfile.py`.
98 | Each factor is on a new row and specified by `Parameter Name`, `Parameter Type`, `Level Number`, `Minimum`, `Maximum`
99 |
100 | 2) After the experiment is run, the results have to be filled in an experiment result table (see `Experiment_results_template.xlsx`). A new excel result template table can be also created with `create_resultfile.py`
101 | The result table allows to fill in multiple output properties (Y_label: output target to be predicted) and experiment positions. The results have to be provided in the table with the following columns:
102 |
103 | * `Nexp`: Run# of experiment, need to match Run# in Experiment setup and design.
104 | * `PID`: Identifier# or label of location (point) in experiment (e.g. if experiment is run at different locations simultaneously).
105 | * `Y Label`: Identifier# or label of Y-Variate (target property that has to be predicted or evaluated, e.g. Rain and Temperature). This allows to include multi-output models with distinct target properties. Note that currently each Y variate is evaluated separately.
106 | * `Y Exp` The experiment result for Y
107 | * `Y Truth` (optional) if the true value is available for Y. This is required to calculate the RMSE and to select best parameter space.
108 | * Not currently considered (yet) in result stats computation: `Std Y Exp`, `Std Y Truth`, `Weight PID`
109 |
110 | {width=600}
111 |
112 |
113 | {width=600}
114 |
115 |
116 | ### Running Tests
117 |
118 | To verify that DoEgen works, you can run the example experiment
119 |
120 | ``` bash
121 | $ python -m doegen.init_tests
122 | $ python -m doegen.doegen test/settings_design_test.yaml
123 | $ python -m doegen.doeval test/settings_expresults_test.yaml
124 | ```
125 |
126 | ### Documentation
127 |
128 | Please do not modify `README.md`. Instead make any changes in the master documentation file `MANUAL.md` (uses pandoc markdown syntax) and then convert to the inferior Github markdown flavor (note that the new github-flavored markdown format gfm option does not correctly solve figure caption and resize options):
129 | ```bash
130 | pandoc -f markdown -t markdown_github MANUAL.md -o README.md
131 | ```
132 | and to pdf:
133 | ```bash
134 | pandoc -V geometry:margin=1.2in MANUAL.md -o docs/MANUAL.pdf
135 | ```
136 | or as standalone html:
137 | ```bash
138 | pandoc MANUAL.md -o MANUAL.html
139 | ```
140 |
141 | ## Main Modules and Usage
142 |
143 |
144 | ### Design Generation
145 | Design generation with `doegen.py`:
146 | Main model for generating optimised designs and computation of efficiencies.
147 | Settings are specified in settings yaml file `settings_design.yaml`.
148 | If the yaml and .xlsx template files are not yet in your working directory (e.g. after first doegen installation), you can create the yaml and excel template files with
149 |
150 | ``` bash
151 | $ python -m doegen.init_config
152 | ```
153 |
154 | Before running `doegen.py`, two things have to be done:
155 |
156 | 1) fill in experiment setup table (see template provided `Experiment_setup_template.xlsx` or example in `test/` folder)
157 | 2) provide settings in settings file (see `settings_design.yaml`)
158 |
159 | Now you are ready to run the design generation
160 |
161 | ``` bash
162 | $ python -m doegen.doegen settings_design.yaml
163 | ```
164 |
165 | This will produce a number of files for different experiment run lengths (see folder `test/results/DesignArray_Nrun...`):
166 |
167 | * The optimised design array `EDarray_[factor_levels]_Nrun.csv`.
168 | * A table of design efficiencies `Efficiencies_[factor_levels]_Nrun.csv`
169 | * Table of Canonical Correlation Coefficients `Table_Canonical_Correlation.csv`
170 | * Table of two-way Interaction balance `Table_Interaction_Balance.txt`
171 | * Table of Pearson correlation coefficients between all factor pairs `Table_Pearson_Correlation.csv`
172 | * Plot of pairwise correlation including regression fit `pairwise_correlation.png` (see example plot below)
173 |
174 | Besides the default optimisation (based on function `doegen.doegen.optimize_design`), DoEgen also allows the user to construct full orthogonal designs using the function `doegen.doegen.gen_highD`, which is based on OApackage orthogonal arrays and extensions. However, this works only for special cases with limited number of factors and design levels. Thus, it is currently not fully automated but might assist advanced users to construct optimal designs.
175 |
176 |
177 | ### Design Selection
178 |
179 | DoEgen will select by default three designs based on the following criteria:
180 |
181 | 1) minimum Design with the criteria:
182 |
183 | * number of runs >= number of factors + 1
184 | * center balance > 95%
185 | * level balance > 95%
186 | * Orthogonal Balance > 90%
187 | * Two Level interaction Balance > 90%
188 | * Two Level Interaction Minimum One = 100%
189 |
190 | 2) optimal Design with the criteria:
191 |
192 | * center balance > 98%
193 | * level balance > 98%
194 | * Orthogonal Balance > 95%
195 | * Two Level interaction Balance > 95%
196 | * Two Level Interaction Minimum One = 100%
197 |
198 | 3) best design which is based on best score that is sum of efficiencies above and includes a small penalty for runsize relative to maximum runsize
199 |
200 | This will deliver (see folder `test/results/`):
201 |
202 | * Overview summary of the three designs and their main efficiencies: `Experiment_Design_selection_summary.txt`
203 | * Three tables (`Designtable_minimum/optimal/best...csv`) for the three suggested designs that are converted into the actual level values
204 | * An overview of the efficiencies is plotted as function of exp run and saved in `Efficiencies_[factor_levels].png`
205 |
206 | In case the user wants to select another design for a different run size, one can convert the design array into a design table with the function `doegen.doegen.array2valuetable()`.
207 |
208 | {width=400}
209 |
210 |
211 | ### Design Efficiencies
212 |
213 | DoEgen computes more than ten efficiencies and saves them as .csv file for each generated design array.
214 | All indicators, except for the canonical correlations, have a range from 0 (worst possible) to 1 (optimal):
215 |
216 | * Center Balance: 100% [1 - Sum(Center-Deviation)/Array Size], i.e. the average center balance over all factors.
217 | * Level Balance: Defined as 100% [1 - Sum(Imbalance)/Array Size], the average level balance over all factors.
218 | * Orthogonality: Defined as 100% [1 - Orthogonality], i.e. the average orthogonality over all factor pairs.
219 | * Two-way Interaction Balance: Similar to level balance but for pairwise factor balance.
220 | * Two-way Interaction with at least one occurrence: 100% [1 - Sum(Not at least one pairwise factor occurrence)/number of pairwise combinations]; 100% if all factor-level pair combinations occur at least once.
221 | * D-Eff: D-Efficiency (model includes main term and quadratic).
222 | * D1 Eff: only main terms
223 | * D2 Eff: main, quadratic, and interaction terms
224 | * A-Eff: A-efficiency (main term and quadratic)
225 | * A1-Eff: only main terms
226 | * A2-Eff: main, quadratic, and interaction terms
227 | * Acor_can_avg: average canonical correlation efficiency
228 | * Acor_can_max: maximal canonical correlation coefficient
229 |
230 | For further inspection, `doegen.doegen.evaluate_design2` also creates the following tables and plots:
231 |
232 | * Table of Canonical Correlation
233 | * Table of Pearson Correlation (same as above if normalised discrete variables)
234 | * Table of Two-way Interaction Balance
235 | * Cornerplot of pairwise factor relation with Y
236 |
237 | {width=600}
238 |
239 |
240 |
241 | ### Experiment Result Analysis
242 |
243 | Experiment Result Analysis with `doeval.py`:
244 | The experiment results have to be provided in a result table with the format as specified in #user-templates, and specifications in the `settings_expresults.yaml` file.
245 | Then run
246 | ``` bash
247 | $ python -m doegen.doeval settings_expresults.yaml
248 | ```
249 | This will create the following stats tables and plots (see folder `test/expresults/` as example):
250 |
251 | * A valuation of the factors in terms of "importance", which is defined by the maximum change (range) in the average Y between any factor levels. Results are visualized in a bar plot and saved as csv, including, min, max, std deviation across all levels
252 | * Computes RMSE between experiment result and ground truth; results saved as csv.
253 | * Ranks list of top experiments and their parameters based on RMSE
254 | * Computes average and variance of best parameters weighted with RMSE; saved to csv file
255 | * An overview plot of all the correlation plots between Y and each factor (see function `plot_regression`)
256 | * Moreover it will plot Y value for each pairwise combination of factors (see function `plot_3dmap`), which allows the user to visualise categorical factors
257 |
258 | {width=600}
259 |
260 | {width=600}
261 |
262 |
263 |
264 |
265 | ## Use Case Study
266 |
267 | Here we demonstrate a typical use case where we would like to first generate and select an optimal experiment design. Then subsequently after running the experiment we would like to answer the question which is the best parameter space and what parameters are important. Our case study is given by the test example, which consists of 8 factors (parameters) that are specified in the experiment setup table `Experiment_setup_test.xlsx`.
268 |
269 | {width=600}
270 |
271 | The first goal is to generate an efficient design with only a fraction of the entire parameter combination (in our case the full factorial would be $3^6 \times 2^2 = 2916$). The maximum number of experiments (in this case we choose 150) is set in the file `settings_design_test.yaml`, which also specifies input and output directory names, as well as the maximum time for optimising one run (in this case 100 seconds per design optimisation). This configuration will generate and optimize a range of experiments with different design run sizes from 12 to 150, in steps of 6 runsizes (since the lowest common multiple of our mix of 2 and 3 factor levels is 6). Note that the user can also choose a different stepsize, which can be done by setting the value in the setting parameter `delta_nrun`.
272 | Now we are all set up to start the experiment design generation and optimisation script, which we do by running the script doegen.py with the settings file as argument:
273 | ``` bash
274 | $ cd DoEgen
275 | $ python -m doegen.doegen test/settings_design_test.yaml
276 | ```
277 | This will generate for each runsize an optimised design array and a list of efficiencies and diagnostic tables and plots (see [Design Generation] for more details). To simplify the selection of the generated experiment designs, DoEgen suggests automatically three designs: 1) one minimum design (lowest number of runs at given efficiency threshold), 2) one optimal design, and 3) one best design (either equal or has larger experiment run number than optimal design). In our case the three designs are selected for run numbers 30 (minimum), 72 (optimal), 90 (best). Since the optimal design has basically almost the same efficiencies as the best design (see figure below) but at a lower cost of experiment runs, we choose for our experiment the optimal design, which is given in the table `Designtable_optimal_Nrun72.csv`.
278 |
279 | {width=600}
280 |
281 | {width=600}
282 |
283 | Now it is time to run the experiment. In our example, we produce just some random data for the 72 experiments with 10 sensor locations (PID 1 to 10) and one output variable Y (e.g. temperature). To analyse the experiment, the results have to be written in a structured table with the format as given in `experiment_results_Nrun72.xlsx` (see description in figure below).
284 |
285 | {width=600}
286 |
287 | To run the experiment analysis script, settings such as for input output directory names are given in the settings file `settings_expresults_test.yaml`, and we can now run the analysis script with
288 | ``` bash
289 | $ python -m doegen.doeval test/settings_expresults_test.yaml
290 | ```
291 | This analysis produces a range of diagnostic tables and result plots for each output variable Y (in our case we have only one Y). One of the questions of this example use case is to identify what factors are important, which is given in the figure `Ybarplot.png`. The "importance" basically indicates how much a factor changes Y (defined by the maximum average change in Y between any levels). This has the advantage to identify also important factors that have either low linear regression coefficients with Y (see r values in plot `Expresult_correlation_X.png`) or are categorical. Such insight can be valuable to determine, e.g., which factors should be investigated in more detail in a subsequent experiment or to estimate which factors have no effect on Y.
292 |
293 | {width=600}
294 |
295 | Another important question is what are the best parameter values based on the obtained experiment results so far? This question can be answered by computing the Root-Mean-Square-Error between experiment results and ground truth (or alternatively the likelihood if the model predictions include also uncertainties). Table `Experiment_1_RMSE_Top10_sorted.csv` provides an overview of the top 10 experiments sorted as function of their RMSE. Moreover we can calculate the (RMSE-weighted) average of each factor for the top experiments as shown in bar plot below.
296 |
297 | {width=600}
298 |
299 | {width=600}
300 |
301 | Furthermore, multiple other diagnostics plots such as factor-Y correlation and pairwise correlation maps are generated (see [Experiment Result Analysis] for more details).
302 |
303 |
304 |
305 |
306 | ## Comparison to Other DoE Tools
307 |
308 | The aim of DoEgen is to provide an open-source tool for researchers to create optimised designs and a framework for transparent evaluation of experiment designs. Moreover, DoEgen aims to assist the result analysis that may allow the researcher a subsequent factor selection, parameter fine-tuning, or model building. The design generation function of DoEgen is built upon the excellent package `OApackage` and extends it further in terms of design efficiency evaluation, filtering, automation, and experiment analysis. There are multiple other tools available for DoE; the table below provides a brief (preliminary) summary of the main advantages and disadvantages for each tool that has been tested.
309 |
310 |
311 | Feature | SAS JMP | pyDOE2 | OApackage | DoEgen |
312 | ---------------------------|:---------:|:--------:|:---------:|:--------:|
313 | Open-Source | no (paid) | yes | yes | yes |
314 | Design Optimisation Score | very good | limited | good | good |
315 | Optimal Runsize Finder | no | no | no | yes |
316 | Design Efficiency Eval | yes | no | limited | yes |
317 | Exp Result Analysis | yes | no | no | yes |
318 | Development Stage | advanced | early | moderate |very early|
319 |
320 |
321 |
322 | ## Literature
323 |
324 | [OApackage: A Python package for generation and analysis of orthogonal arrays, optimal designs and conference designs](https://doi.org/10.21105/joss.01097), P.T. Eendebak, A.R. Vazquez, Journal of Open Source Software, 2019
325 |
326 | [pyDOE2: An experimental design package for python](https://github.com/clicumu/pyDOE2)
327 |
328 | Dean, A., Morris, M., Stufken, J. and Bingham, D. eds., 2015. Handbook of design and analysis of experiments (Vol. 7). CRC Press.
329 |
330 | Goos, P. and Jones, B., 2011. Optimal design of experiments: a case study approach. John Wiley & Sons.
331 |
332 | Kuhfeld, W.F., 2010. Discrete choice. SAS Technical Papers, 2010, pp.285-663.
333 |
334 | Zwerina, K., Huber, J. and Kuhfeld, W.F., 1996. A general method for constructing efficient choice designs. Durham, NC: Fuqua School of Business, Duke University.
335 |
336 | Cheong, Y.P. and Gupta, R., 2005. Experimental design and analysis methods for assessing volumetric uncertainties. SPE Journal, 10(03), pp.324-335.
337 |
338 | JMP, A. and Proust, M., 2010. Design of experiments guide. Cary, NC: SAS Institute Inc.
339 |
340 |
341 |
342 | ## Attribution and Acknowledgments
343 |
344 | Acknowledgments are an important way for us to demonstrate the value we bring to your research. Your research outcomes are vital for ongoing funding of the Sydney Informatics Hub.
345 |
346 | If you make use of this code for your research project, please include the following acknowledgment:
347 |
348 | “This research was supported by the Sydney Informatics Hub, a Core Research Facility of the University of Sydney.”
349 |
350 |
351 | ## Contributors
352 |
353 | We would like to thank Dietmar Muller (School of Geophysics, University of Sydney) for suggesting the need for this library, Danial Azam (School of Geophysics, University of Sydney) for testing DoEgen on real-world cases, Christopher Howden (SIH, University of Sydney) for
354 | statistical consultancy, literature suggestions, and documentation
355 | review, and Joel Nothman for the code review.
356 |
357 | DoEgen has benefited from the OApackage library [OApackage](https://github.com/eendebakpt/oapackage) for the design optimisation code and we would like to thank the researchers who have made their code available as open-source.
358 |
359 |
360 | ## License
361 |
362 | Copyright 2020 Sebastian Haan, The University of Sydney
363 |
364 | DoEgen is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License (AGPL version 3) as published by the Free Software Foundation.
365 |
366 | This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details.
367 |
368 | You should have received a copy of the GNU Affero General Public License along with this program (see LICENSE.md). If not, see [https://www.gnu.org/licenses/](https://www.gnu.org/licenses/).
369 |
--------------------------------------------------------------------------------
/docs/MANUAL.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/docs/MANUAL.pdf
--------------------------------------------------------------------------------
/doegen/Experiment_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/Experiment_results.xlsx
--------------------------------------------------------------------------------
/doegen/Experiment_setup.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/Experiment_setup.xlsx
--------------------------------------------------------------------------------
/doegen/Experiment_setup_extended.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/Experiment_setup_extended.xlsx
--------------------------------------------------------------------------------
/doegen/__init__.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 | #DoEgen: A Python Library for Optimised Design of Experiment Generation and Evaluation
3 | #
4 | #DoEgen is free software made available under the LGPL License.
5 | #For details see the LICENSE file.
6 | #
7 | #@author: Sebastian Haan
8 |
9 | """
10 | DoEgen is a Python library aiming to assist in generating optimised
11 | Design of Experiments (DoE), evaluating design efficiencies, and
12 | analysing experiment results.
13 |
14 | In a first step, optimised designs can be automatically generated and
15 | efficiencies evaluated for any mixture of factor-levels for numeric and
16 | categorical factors. Designs are automatically evaluated as function of
17 | number of experiment runs and the most efficient designs are suggested.
18 | In particular DoEgen provides computation of a wide range of design
19 | efficiencies and allows to import and evaluate externally generated
20 | designs as well.
21 |
22 | The second part of DoEgen assists in analysing any derived experiment
23 | results in terms of factor importance, correlations, and response
24 | analysis for best parameter space selection.
25 |
26 | Definitions
27 | -----------
28 |
29 | An Experiment Design is typically defined by:
30 |
31 | - Number of Factors: the parameters or variates of the experiment
32 | - Number of Runs: the number of experiments
33 | - Levels: The number of value options for each factor, which can be
34 | either numeric values (discrete or continuous) or categorical.
35 | Discrete levels for continuous factors can be obtained by providing
36 | the minimum and maximum of the factor range and the number of
37 | levels. The more levels, the more fine-grained the experiment will
38 | evaluate this factor, but also more experimental runs are required.
39 |
40 | The goal of optimising an experimental design is to provide an efficient
41 | design that is near-optimal in terms of, e.g., orthogonality, level
42 | balance, and two-way interaction coverage, yet can be performed with a
43 | minimum number of experimental runs, which are often costly or
44 | time-consuming.
45 |
46 | Functionality
47 | -------------
48 |
49 | If you would like to jumpstart a new experiment and to skip the
50 | technical details, you can find a summary of the main usage of DoEgen in
51 | Case Study Use Case in the README.
52 |
53 | Currently, the (preliminary) release contains several functions for
54 | generating and evaluating designs. Importing and evaluating external
55 | designs is supported (e.g. for comparison to other DoE generator tools).
56 | DoEgen also implements several functions for experiment result analysis and
57 | visualisation of parameter space.
58 |
59 | The main functionalities are (sorted in order of typical experiment
60 | process):
61 |
62 | - Reading Experiment Setup Table and Settings (Parameter Name, Levels
63 | for each factor, Maximum number of runs, Min/Max etc)
64 | - Generating optimised design arrays for a range of runs (given
65 |     maximum number of runs, and optional computation-time constraints,
66 | see `settings_design.yaml`).
67 | - Evaluation and visualisation of more than ten design efficiencies
68 | such as level balance, orthogonality, D-efficiencies etc (see
69 | [Design Efficiencies](#design-efficiencies) for the complete list).
70 | - Automatic suggestion of minimum, optimal, and best designs within a
71 | given range of experiment runs.
72 | - Import and evaluation of externally generated design arrays.
73 | - Experiment result analysis: Template table for experiment results,
74 | multi-variant RMSE computation, best model/parameter selection,
75 | Factor Importance computation, pairwise response surface and
76 | correlation computation, factor correlation analysis and Two-way
77 | interaction response plots.
78 | - Visualisation of experiment results.
79 |
80 | Installation And Requirements
81 | -----------------------------
82 |
83 | ### Requirements
84 |
85 | - Python >= 3.6
86 | - OApackage
87 | - xlrd
88 | - XlsxWriter
89 | - openpyxl
90 | - Numpy
91 | - Pandas
92 | - PyYAML
93 | - scikit_learn
94 | - matplotlib
95 | - seaborn
96 |
97 | The DoEgen package is currently considered experimental and has been
98 | tested with the libraries specified in `requirements.txt`.
99 |
100 | Installation instructions and documentation for OApackage (tested with
101 | OApackage 2.7.11) can be found at https://pypi.org/project/OApackage/ or
102 | can be installed with
103 |
104 | pip install OAPackage
105 |
106 | Please see for more details the README.
107 | """
108 |
# Package metadata; version should be kept in sync with setup.py.
__version__ = "0.5.0"
__author__ = "Sebastian Haan"
--------------------------------------------------------------------------------
/doegen/configloader.py:
--------------------------------------------------------------------------------
# Load design settings from a YAML file given on the command line and expose
# every top-level settings key as a module-level attribute of this module.

import argparse
import yaml

ap = argparse.ArgumentParser()
ap.add_argument("settings_path", nargs="?", default="settings_design.yaml")
args = ap.parse_args()
print(f"using settings in: {args.settings_path!r}")
with open(args.settings_path) as f:
    cfg = yaml.safe_load(f)
# At module scope locals() is globals(), so each settings key becomes an
# attribute of this module (e.g. configloader.outpath).
for key, value in cfg.items():
    locals()[str(key)] = value
--------------------------------------------------------------------------------
/doegen/configloader_results.py:
--------------------------------------------------------------------------------
# Load experiment-result settings from a YAML file given on the command line
# and expose every top-level settings key as a module-level attribute.

import argparse
import yaml

ap = argparse.ArgumentParser()
ap.add_argument("settings_path", nargs="?", default="settings_expresults.yaml")
args = ap.parse_args()
print(f"using settings in: {args.settings_path!r}")
with open(args.settings_path) as f:
    cfg = yaml.safe_load(f)
# At module scope locals() is globals(), so each settings key becomes an
# attribute of this module (e.g. configloader_results.outpath).
for key, value in cfg.items():
    locals()[str(key)] = value
--------------------------------------------------------------------------------
/doegen/create_resultfile.py:
--------------------------------------------------------------------------------
"""
Generates an Excel template file for entering experiment results.

Author: Sebastian Haan
Affiliation: Sydney Informatics Hub (SIH), The University of Sydney
Version: 0.1
License: AGPL-3.0
"""

import xlsxwriter

workbook = xlsxwriter.Workbook('Experiment_results_template.xlsx')
worksheet = workbook.add_worksheet()

workbook.set_properties({
    'title': 'Experimental Design Results',
    'subject': 'Template',
    'author': 'Sebastian Haan',
    'company': 'SIH, The University of Sydney',
    'comments': 'Created with Python and XlsxWriter',
})

# Locked, highlighted format for the header row.
header_format = workbook.add_format({
    'border': 2,
    'bg_color': '#C6EFCE',
    'bold': True,
    'text_wrap': True,
    'valign': 'bottom',
    'indent': 1,
    'locked': True,
})
header_format.set_font_size(14)

# Data cells stay editable while the sheet itself is protected.
unlocked = workbook.add_format({'locked': False})
worksheet.protect()

worksheet.set_default_row(20)
worksheet.set_row(0, 20)
worksheet.set_column('A:H', 15, unlocked)

# Column headers; 'Nexp' must match the identifier in the experiment setup
# file so results can be merged with the associated design parameters.
headings = [
    'Nexp',         # experiment run identifier (same as in setup file)
    'PID',          # optional: ID of measurement point (e.g. spatial/temporal position)
    'Y Label',      # optional: index of multi output-target, if applicable
    'Y Exp',        # experiment or simulation result for given PID and Y Label
    'Y Truth',      # optional: ground truth for given PID and Y Label
    'Std Y Exp',    # optional: stddev (noise) of experiment result
    'Std Y Truth',  # optional: stddev (noise) of ground truth
    'Weight PID',   # optional: weight for positional measurement with PID
]
for col, heading in enumerate(headings):
    worksheet.write(0, col, heading, header_format)

# Keep the header row visible while scrolling.
worksheet.freeze_panes(1, 0)

workbook.close()

print('Excel Template Created')
--------------------------------------------------------------------------------
/doegen/create_setupfile.py:
--------------------------------------------------------------------------------
"""
Generates an Excel template file for the experimental design setup.

Author: Sebastian Haan
Affiliation: Sydney Informatics Hub (SIH), The University of Sydney
Version: 0.1
License: LGPL-3.0
"""

import xlsxwriter

workbook = xlsxwriter.Workbook('Experiment_setup_template.xlsx')
worksheet = workbook.add_worksheet()

workbook.set_properties({
    'title': 'Experimental Design Setup',
    'subject': 'Template',
    'author': 'Sebastian Haan',
    'company': 'SIH, The University of Sydney',
    'comments': 'Created with Python and XlsxWriter',
})

# Locked, highlighted format for the header row.
header_format = workbook.add_format({
    'border': 2,
    'bg_color': '#C6EFCE',
    'bold': True,
    'text_wrap': True,
    'valign': 'bottom',
    'indent': 1,
    'locked': True,
})
header_format.set_font_size(14)

# Data cells stay editable while the sheet itself is protected.
unlocked = workbook.add_format({'locked': False})
worksheet.protect()

worksheet.set_default_row(20)
worksheet.set_row(0, 30)
worksheet.set_column('A:E', None, unlocked)

# Column widths (parameter names get a wider column).
for cols, width in [('A:A', 50), ('B:B', 20), ('C:C', 20),
                    ('D:D', 20), ('E:E', 20)]:
    worksheet.set_column(cols, width, unlocked)

# Header row: one entry per setup property.
headings = ['Parameter Name', 'Parameter Type', 'Level Number',
            'Minimum', 'Maximum']
for col, heading in enumerate(headings):
    worksheet.write(0, col, heading, header_format)

# Restrict entries: parameter type from a fixed list, level number 2..10.
worksheet.data_validation('B2:B20', {'validate': 'list',
                                     'source': ['Continuous', 'Discrete', 'Categorical']})
worksheet.data_validation('C2:C20', {'validate': 'integer',
                                     'criteria': 'between',
                                     'minimum': 2,
                                     'maximum': 10})

# Keep the header row visible while scrolling.
worksheet.freeze_panes(1, 0)

workbook.close()
--------------------------------------------------------------------------------
/doegen/create_setupfile_extended.py:
--------------------------------------------------------------------------------
"""
Generates an Excel template file for the extended experimental design setup
(adds an include flag and an explicit level list per parameter).

Author: Sebastian Haan
Affiliation: Sydney Informatics Hub (SIH), The University of Sydney
Version: 0.1
License: LGPL-3.0
"""

import xlsxwriter

workbook = xlsxwriter.Workbook('Experiment_setup_extended.xlsx')
worksheet = workbook.add_worksheet()

workbook.set_properties({
    'title': 'Experimental Design Setup (Extended Version)',
    'subject': 'Template',
    'author': 'Sebastian Haan',
    'company': 'SIH, The University of Sydney',
    'comments': 'Created with Python and XlsxWriter',
})

# Locked, highlighted format for the header row.
header_format = workbook.add_format({
    'border': 2,
    'bg_color': '#C6EFCE',
    'bold': True,
    'text_wrap': True,
    'valign': 'bottom',
    'indent': 1,
    'locked': True,
})
header_format.set_font_size(14)

# Data cells stay editable while the sheet itself is protected.
unlocked = workbook.add_format({'locked': False})
worksheet.protect()

worksheet.set_default_row(20)
worksheet.set_row(0, 30)
worksheet.set_column('A:E', None, unlocked)

# Column widths (parameter names and level lists get wider columns).
for cols, width in [('A:A', 40), ('B:B', 20), ('C:C', 20), ('D:D', 20),
                    ('E:E', 20), ('F:F', 20), ('G:G', 50)]:
    worksheet.set_column(cols, width, unlocked)

# Header row: one entry per setup property.
headings = ['Parameter Name', 'Parameter Type', 'Level Number',
            'Minimum', 'Maximum', 'Include (Y/N)', 'Levels']
for col, heading in enumerate(headings):
    worksheet.write(0, col, heading, header_format)

# Restrict entries: type from a fixed list, level number 2..10,
# include flag Yes/No.
worksheet.data_validation('B2:B20', {'validate': 'list',
                                     'source': ['Continuous', 'Discrete', 'Categorical']})
worksheet.data_validation('C2:C20', {'validate': 'integer',
                                     'criteria': 'between',
                                     'minimum': 2,
                                     'maximum': 10})
worksheet.data_validation('F2:F20', {'validate': 'list',
                                     'source': ['Yes', 'No']})

# Keep the header row visible while scrolling.
worksheet.freeze_panes(1, 0)

workbook.close()
--------------------------------------------------------------------------------
/doegen/doeval.py:
--------------------------------------------------------------------------------
1 | """
2 | Package to evaluate the response and factor effectiveness of experiment results.
3 |
4 | Author: Sebastian Haan
5 | Affiliation: Sydney Informatics Hub (SIH), The University of Sydney
6 | Version: Experimental
7 | License: LGPL-3.0
8 |
9 | Tested with Python 3.7
10 |
11 | Main Capabilities:
12 | - Multi-variant RMSE computation and best model/parameter selection
13 | - Factor Importance computation
14 | - Pairwise response surface and correlation computation
15 | - Factor correlation analysis and Two-way interaction response plots
16 | - Visualisation plots
17 |
18 | ToDo:
19 | Change to Pathlib
20 |
21 | Changes to previous version:
22 | - replace configloader with function arguments
23 | """
24 |
25 | import os
26 | import sys
27 | import argparse
28 | import yaml
29 | from pathlib import Path
30 | import numpy as np
31 | import pandas as pd
32 | from mpl_toolkits import mplot3d
33 | import matplotlib.pyplot as plt
34 | import seaborn as sns
35 | import matplotlib.ticker as ticker
36 |
sns.set()  # apply seaborn's default styling to all matplotlib plots
# Settings are passed in as function arguments (configloader no longer used here).
39 |
40 |
def merge_expresults(fname_result, fname_design, y_label=None):
    """
    Read experiment results and merge them with the experiment design table.

    INPUT
    fname_result: path + filename of experimental results (see excel template)
    fname_design: path + filename of experimental design setup (see excel template)
    y_label: (Default None) if given, keep only result rows whose 'Y Label'
        column equals this value before merging

    RETURN
    Pandas dataframe: left join of results with design on the 'Nexp' column.
    """
    results = pd.read_excel(fname_result)
    design = pd.read_excel(fname_design)
    if y_label is not None:
        results = results[results["Y Label"] == y_label]
    # Left join keeps every result row and attaches its design parameters.
    return results.merge(design, on="Nexp", how="left")
58 |
59 |
def create_testdata(outpath, fname_out, Nexp, npid=10):
    """
    Create a random set of experiment results for testing.

    Writes one row per experiment run and measurement point, with random
    'Y Exp' / 'Y Truth' values and empty (NaN) optional columns, in the
    format of the experiment results template.

    INPUT
    outpath: output directory (created if it does not exist)
    fname_out: filename in format '*.xlsx'
    Nexp: Number of experiments
    npid: (Default 10) number of measurement points (PID 1..npid) per experiment
    """
    os.makedirs(outpath, exist_ok=True)
    pid = np.arange(1, npid + 1)
    yexp = np.random.rand(Nexp, npid).flatten()
    ytruth = np.random.rand(Nexp, npid).flatten()
    # Optional columns (stddevs, weights) are left empty.
    nan_col = np.full_like(yexp, np.nan)
    # Run index repeated npid times per run; PIDs tiled once per run.
    anexp = np.repeat(np.arange(1, Nexp + 1), npid).astype(float)
    apid = np.tile(pid, Nexp).astype(float)
    array = np.vstack(
        (
            anexp,
            apid,
            np.ones_like(yexp),  # single Y label (1) for all rows
            yexp,
            ytruth,
            nan_col,
            nan_col,
            nan_col,
        )
    )
    header = [
        "Nexp",
        "PID",
        "Y Label",
        "Y Exp",
        "Y Truth",
        "Std Y Exp",
        "Std Y Truth",
        "Weight PID",
    ]
    df = pd.DataFrame(array.T, columns=header)
    df.to_excel(os.path.join(outpath, fname_out), index=False)
103 |
104 |
def weighted_avg_and_std(values, weights):
    """
    Return the weighted average and weighted standard deviation.

    INPUT
    values, weights -- arrays with the same shape.

    RETURN
    (average, stddev) tuple.
    """
    mean = np.average(values, weights=weights)
    # Weighted variance around the weighted mean (fast and numerically precise).
    var = np.average((values - mean) ** 2, weights=weights)
    return (mean, np.sqrt(var))
116 |
117 |
def calc_expresults_stats(ylabels, dfdes, dfres, outpath):
    """
    Statistical evaluation of experimental results for each target variable y:
    1) Factor importance, defined as the maximum range of the level-averaged y
       over the levels of each factor; visualised as bar plot and saved as csv,
       including min, max, mean, and std deviation across all levels.
    2) RMSE between 'Y Exp' and 'Y Truth' per experiment run, saved as csv
       (only computed if any 'Y Truth' values are present).
    3) List of top experiments (lowest RMSE) and their parameters, saved as csv.

    INPUT
    ylabels: label ID for each target variable.
    dfdes: experiment design dataframe (includes one column 'Nexp'; all other
        columns are the factor names)
    dfres: experiment result dataframe ('Nexp' IDs must match design array)
    outpath: path for output files
    """
    npar = len(list(dfdes)) - 1  # number of factors (all columns except 'Nexp')
    nexp = len(dfdes)  # number of experiment runs
    params = list(dfdes)[1:]  # factor names
    for ylabel in ylabels:
        dfdes_y = dfdes.copy()
        # Initialise per-factor summary statistics with NaN
        ymin_par = np.full(npar, np.nan)
        ymax_par = np.zeros(npar) * np.nan
        ystd_par = np.zeros(npar) * np.nan
        ymean_par = np.zeros(npar) * np.nan
        # Select results for this Y label and attach per-run stats to the design.
        # NOTE(review): fillna(0) treats missing measurements as value 0 in the
        # per-run means/stds below -- confirm this is intended (vs. dropping NaNs).
        ydf = dfres[dfres["Y Label"] == ylabel].copy()
        ymean = ydf.fillna(0).groupby("Nexp")["Y Exp"].mean()
        ystd = ydf.fillna(0).groupby("Nexp")["Y Exp"].std()
        ytruemean = ydf.fillna(0).groupby("Nexp")["Y Truth"].mean()
        ytruestd = ydf.fillna(0).groupby("Nexp")["Y Truth"].std()
        # Every design run must have exactly one result group in dfres
        assert len(ymean) == dfdes_y.shape[0]
        dfdes_y["Y Exp Mean"] = ymean.values
        dfdes_y["Y Exp Std"] = ystd.values
        dfdes_y["Y Truth Mean"] = ytruemean.values
        dfdes_y["Y Truth Std"] = ytruestd.values
        # Loop over factors to calculate range, min, max, mean and stddev of
        # the level-averaged response:
        for i, param in enumerate(params):
            levels = dfdes[param].unique()
            ylevel = [
                np.nanmean(dfdes_y.loc[dfdes[param] == level, "Y Exp Mean"])
                for level in levels
            ]
            # NOTE(review): ylevelstd is computed but never used below.
            ylevelstd = [
                np.nanmean(dfdes_y.loc[dfdes[param] == level, "Y Exp Std"])
                for level in levels
            ]
            ymin_par[i] = np.nanmin(ylevel)
            ymax_par[i] = np.nanmax(ylevel)
            ystd_par[i] = np.nanstd(ylevel)
            ymean_par[i] = np.nanmean(ylevel)
        ypos = np.arange(npar)
        # Factor "importance": spread of the level-averaged response
        width = ymax_par - ymin_par
        sort = np.argsort(width)
        # Plot factor importance as horizontal bar plot (sorted by importance)
        plt.ioff()
        plt.figure(figsize=(8, 5))
        plt.barh(
            ypos,
            width=width[sort],
            left=ymin_par[sort],
            tick_label=np.asarray(params)[sort],
            color="red",
        )
        plt.title("Range " + str(ylabel))
        plt.tight_layout()
        plt.savefig(os.path.join(outpath, "Ybarplot_" + str(ylabel) + ".png"), dpi=300)
        plt.close()
        # Save factor importance table to csv:
        res = np.vstack((width, ymin_par, ymax_par, ymean_par, ystd_par))
        dfrange = pd.DataFrame(
            res.T, columns=["Yrange", "Ymin", "Ymax", "Ymean", "Ystd"], index=params
        )
        dfrange.to_csv(
            os.path.join(outpath, "Experiment_" + str(ylabel) + "_Factorimportance.csv")
        )

        # Calculate RMSE and best parameter space (requires ground truth):
        if ydf["Y Truth"].notnull().any():
            rmse = np.zeros(nexp)
            # NOTE(review): ytrue is assigned but never used.
            ytrue = np.zeros(nexp)
            for i in range(nexp):
                # Residuals of run i+1 ('Nexp' is 1-based)
                resid = (
                    ydf.loc[ydf["Nexp"] == i + 1, "Y Exp"]
                    - ydf.loc[ydf["Nexp"] == i + 1, "Y Truth"]
                )
                rmse[i] = np.sqrt(np.nanmean(resid ** 2))
            dfdes_y["RMSE"] = rmse
            # Save overall per-run results including RMSE to csv
            # (rows stay in design order; the RMSE-sorted top list is saved below)
            dfdes_y.to_csv(os.path.join(outpath, "Experiment_" + str(ylabel) + "_RMSE.csv"))

            # Number of top experiments to report, scaled with run size
            # (note: bitwise '&' is safe here since both operands are booleans)
            if nexp >= 20:
                nsel = 10
            elif (nexp >= 10) & (nexp < 20):
                nsel = 5
            else:
                nsel = 3
            dfsort = dfdes_y.sort_values(["RMSE"], ascending=True)
            print(
                "Top "
                + str(nsel)
                + " experiments with best RMSE for "
                + str(ylabel)
                + " :"
            )
            print(dfsort.head(nsel))
            dfsort.iloc[0:nsel].to_csv(
                os.path.join(outpath,
                "Experiment_"
                + str(ylabel)
                + "_RMSE_Top"
                + str(nsel)
                + "_sorted.csv")
            )
        """ takingh out best parameter weighting since averaging might be misleading
        # best parameter space is based on weighted RMSE of top results
        # Note that these are average parameter estimaets and are not considering multi-modal distributions
        # For multi-modal see list of top resulst
        # Select only numeric parameters
        params_num = dfsort[params]._get_numeric_data().columns
        param_wmean = np.zeros(len(params_num)) # weigthed mean
        param_wstd = np.zeros(len(params_num)) # weighted std
        dfsel = dfsort.iloc[0:nsel]
        for i, param in enumerate(params_num):
            param_wmean[i], param_wstd[i] = weighted_avg_and_std(
                dfsel[param].values, 1 / (dfsel["RMSE"].values ** 2)
            )
        params_stats = np.vstack((param_wmean, param_wstd))
        # Save to csv
        dfparam_avg = pd.DataFrame(
            params_stats.T,
            index=params_num,
            columns=["Weighted Average", "Weigted Stddev"],
        )
        dfparam_avg.to_csv(
            cfg.outpath + "Experiment_" + str(ylabel) + "_Best-Parameter-Avg.csv"
        )
        # plot dataframe table
        plot_table(dfparam_avg, cfg.outpath, "BestFactor_Avg_" + str(ylabel) + ".png")
        """
        """
        dfparam_avg.plot(kind="bar", y="Weighted Average", yerr="Weigted Stddev")
        plt.tight_layout()
        """
263 |
264 |
265 |
266 | #clean up x and y axis plots if there are too many decimal points or scientific notation.
def nicexAxis(ax):
    """Reformat numeric x tick labels of *ax* in place and return *ax*.

    If any label is in scientific notation ('e' in the text), all labels are
    reformatted with '%.3g'; otherwise they are rounded to 3 decimals.
    Empty label texts (common before the canvas is drawn) are left untouched.

    Fixes the previous version, which re-set the whole label list once per
    tick (so mixed 'e'/plain labels depended on the last label visited) and
    raised ValueError on empty label strings.
    """
    labels = [item.get_text() for item in ax.get_xticklabels()]
    if any(labels):  # skip entirely-empty tick labels
        if any('e' in lab for lab in labels):
            ax.set_xticklabels(
                ['{:.3g}'.format(float(lab)) if lab else lab for lab in labels])
        else:
            ax.set_xticklabels(
                [str(round(float(lab), 3)) if lab else lab for lab in labels])
    return ax
275 |
def niceyAxis(ax):
    """Reformat numeric y tick labels of *ax* in place and return *ax*.

    If any label is in scientific notation ('e' in the text), all labels are
    reformatted with '%.3g'; otherwise they are rounded to 3 decimals.
    Empty label texts (common before the canvas is drawn) are left untouched.

    Fixes the previous version, which re-set the whole label list once per
    tick (so mixed 'e'/plain labels depended on the last label visited) and
    raised ValueError on empty label strings.
    """
    labels = [item.get_text() for item in ax.get_yticklabels()]
    if any(labels):  # skip entirely-empty tick labels
        if any('e' in lab for lab in labels):
            ax.set_yticklabels(
                ['{:.3g}'.format(float(lab)) if lab else lab for lab in labels])
        else:
            ax.set_yticklabels(
                [str(round(float(lab), 3)) if lab else lab for lab in labels])
    return ax
284 |
# Make 3d correlation plot with heatmap
# (Make 3d scatter to image plot (works only for continuous))
def plot_3dmap(df, params, target_name, fname_out):
    """
    Plots the target quantity as function of two different X variates for each
    pairwise combination of factors. The plot uses a gridded heatmap, which
    enables visualisation of categorical as well as numerical factors.

    INPUT
    df: pandas dataframe
    params: list of factor names
    target_name: column name of quantity to aggregate (e.g. 'Y Exp Mean' or 'RMSE')
    fname_out: output path + filename for image

    OUTPUT
    Cornerplot image saved to fname_out

    NOTE(review): assumes len(params) >= 3 so that plt.subplots returns a 2D
    axes array and axs[0, 1] exists for the colorbar -- confirm for 2-factor designs.
    """
    nfac = len(params)
    # Determine a global color range over all pairwise pivot tables so every
    # heatmap shares the same scale. The seeds are deliberately inverted
    # (min seeded with the overall max and vice versa) so the first table
    # always updates them.
    ymin0 = df[target_name].max()
    ymax0 = df[target_name].min()
    for i in range(nfac - 1):
        for j in range(i + 1, nfac):
            table = pd.pivot_table(
                df,
                values=target_name,
                index=[params[j]],
                columns=[params[i]],
                aggfunc=np.nanmean,
            )
            if np.min(table.min()) < ymin0:
                ymin0 = np.min(table.min())
            if np.max(table.max()) > ymax0:
                ymax0 = np.max(table.max())

    # Make corner plot
    plt.ioff()  # automatic disables display of figures
    fig, axs = plt.subplots(nfac - 1, nfac - 1, figsize=(nfac * 2, nfac * 2))
    for i in range(nfac - 1):
        for j in range(i + 1, nfac):
            table = pd.pivot_table(
                df,
                values=target_name,
                index=[params[j]],
                columns=[params[i]],
                aggfunc=np.nanmean,
            )
            g = sns.heatmap(
                table,
                cmap="Spectral",
                annot=False,
                ax=axs[j - 1, i],
                vmin=ymin0,
                vmax=ymax0,
                square=True,
                cbar=False,
            )

            # These factors tend to render with too many decimals or in
            # scientific notation; tidy their tick labels.
            if params[j] == 'N' or params[j] == 'Erodibility' or params[j] == 'MNrat':
                axs[j - 1, i] = niceyAxis(axs[j - 1, i])

            if params[i] == 'N' or params[i] == 'Erodibility' or params[i] == 'MNrat':
                axs[j - 1, i] = nicexAxis(axs[j - 1, i])

            # Only the outer edge of the corner plot keeps axis labels.
            if i > 0:
                g.set_ylabel("")
                g.set(yticklabels=[])
            if j < nfac - 1:
                g.set_xlabel("")
                g.set(xticklabels=[])
    # remove remaining plots (upper triangle) by drawing all-NaN heatmaps:
    for i in range(1, nfac - 1):
        for j in range(1, i + 1):
            g = sns.heatmap(
                table * np.nan,
                cmap="Spectral",
                annot=False,
                ax=axs[j - 1, i],
                vmin=ymin0,
                vmax=ymax0,
                square=True,
                cbar=False,
            )
            g.set_ylabel("")
            g.set(yticklabels=[])
            g.set_xlabel("")
            g.set(xticklabels=[])

    # Make colorbar (drawn on an empty upper-triangle axes)
    g = sns.heatmap(
        table * np.nan,
        cmap="Spectral",
        annot=False,
        ax=axs[0, 1],
        vmin=ymin0,
        vmax=ymax0,
        square=True,
        cbar=True,
    )
    g.set_xlabel("")
    g.set_ylabel("")
    g.set(yticklabels=[])
    g.set(xticklabels=[])
    fig.suptitle("Pair-Variate Plot for "+str(target_name)+" Function")
    plt.savefig(fname_out, dpi=300, bbox_inches="tight")
    # Close the figure so repeated calls (one per ylabel) do not accumulate
    # open figures; also removed a leftover debug print of target_name.
    plt.close(fig)
377 |
## This doesn't appear to get used anywhere from the main doeval function, can call this individually?
def plot_3dmap_rmse(df, params, fname_out):
    """
    Plots RMSE value as function of two different X variates for each pairwise
    combination of factors. The plot uses a gridded heatmap, which enables
    visualisation of categorical factors and not just numerical data.

    INPUT
    df: pandas dataframe (must contain an 'RMSE' column)
    params: list of factor names
    fname_out: output path + filename for image

    OUTPUT
    Cornerplot of RMSE-PairwiseCorrelation Images saved to fname_out

    NOTE(review): assumes len(params) >= 3 so that plt.subplots returns a 2D
    axes array and axs[0, 1] exists for the colorbar -- confirm for 2-factor designs.
    """
    print('Plotting RMSE as function of pairwise covariates ...')
    nfac = len(params)
    # Determine a global color range over all pairwise pivot tables; seeds are
    # deliberately inverted (min seeded with max and vice versa) so the first
    # comparison always updates them.
    ymin0 = df["RMSE"].max()
    ymax0 = df["RMSE"].min()
    for i in range(nfac - 1):
        for j in range(i + 1, nfac):
            table = pd.pivot_table(
                df,
                values="RMSE",
                index=[params[j]],
                columns=[params[i]],
                aggfunc=np.nanmean,
            )
            if np.min(table.min()) < ymin0:
                ymin0 = np.min(table.min())
            if np.max(table.max()) > ymax0:
                ymax0 = np.max(table.max())
    # Make corner plot
    plt.ioff()  # automatic disables display of figures
    fig, axs = plt.subplots(nfac - 1, nfac - 1, figsize=(nfac * 2, nfac * 2))
    for i in range(nfac - 1):
        for j in range(i + 1, nfac):
            table = pd.pivot_table(
                df,
                values="RMSE",
                index=[params[j]],
                columns=[params[i]],
                aggfunc=np.nanmean,
            )
            g = sns.heatmap(
                table,
                cmap="viridis",
                annot=False,
                ax=axs[j - 1, i],
                vmin=ymin0,
                vmax=ymax0,
                square=True,
                cbar=False,
            )
            # Only the outer edge of the corner plot keeps axis labels.
            if i > 0:
                g.set_ylabel("")
                g.set(yticklabels=[])
            if j < nfac - 1:
                g.set_xlabel("")
                g.set(xticklabels=[])
    # remove remaining plots (upper triangle) by drawing all-NaN heatmaps:
    for i in range(1, nfac - 1):
        for j in range(1, i + 1):
            g = sns.heatmap(
                table * np.nan,
                cmap="viridis",
                annot=False,
                ax=axs[j - 1, i],
                vmin=ymin0,
                vmax=ymax0,
                square=True,
                cbar=False,
            )
            g.set_ylabel("")
            g.set(yticklabels=[])
            g.set_xlabel("")
            g.set(xticklabels=[])
    # Make colorbar (drawn on an empty upper-triangle axes)
    g = sns.heatmap(
        table * np.nan,
        cmap="viridis",
        annot=False,
        ax=axs[0, 1],
        vmin=ymin0,
        vmax=ymax0,
        square=True,
        cbar=True,
    )
    g.set_xlabel("")
    g.set_ylabel("")
    g.set(yticklabels=[])
    g.set(xticklabels=[])
    fig.suptitle("Pair-Variate Plot for RMSE Function")
    plt.savefig(fname_out, dpi=300)
    # Close the figure to avoid accumulating open figures across calls.
    plt.close(fig)
476 |
477 |
def plot_regression(df, params, target_name, fname_out):
    """
    Creates Correlation plot with Y or RMSE for each numeric Variate
    Note that only numeric data is selected for this plot

    INPUT
    df: dataframe
    params: list of factor names
    target_name: 'Y Exp Mean' or 'RMSE'
    fname_out: output path + filename for image

    OUTPUT
    Image with Correlations saved to fname_out
    """
    # Select numeric variates:
    columns = df[params]._get_numeric_data().columns
    nfac = len(columns)
    # Arrange subplots in a near-square grid.
    nax1 = int(np.sqrt(nfac))
    nax2 = int(np.ceil(nfac / int(np.sqrt(nfac))))
    plt.ioff()  # automatic disables display of figures
    fig = plt.figure(figsize=(nax1 * 5, nax2 * 4))
    for i in range(nfac):
        # Pearson correlation coefficient between this factor and the target.
        r = df[columns[i]].corr(df[target_name])
        plt.subplot(nax2, nax1, i + 1)
        ax = sns.regplot(x=columns[i], y=target_name, data=df)
        ax.annotate("r = {:.3f}".format(r), xy=(0.1, 0.9), xycoords=ax.transAxes)
    plt.savefig(fname_out, dpi=300)
    # Close the figure so repeated calls (one per ylabel) do not leak figures.
    plt.close(fig)
506 |
def plot_factordis(df, params, target_name, fname_out):
    """
    Creates distribution (violin) plot of Y or RMSE for each numeric Variate
    Note that only numeric data is selected for this plot

    INPUT
    df: dataframe
    params: list of factor names
    target_name: 'Y Exp Mean' or 'RMSE'
    fname_out: output path + filename for image

    OUTPUT
    Image with per-factor distributions saved to fname_out
    """
    # Select numeric variates:
    columns = df[params]._get_numeric_data().columns
    nfac = len(columns)
    # Arrange subplots in a near-square grid.
    nax1 = int(np.sqrt(nfac))
    nax2 = int(np.ceil(nfac / int(np.sqrt(nfac))))
    plt.ioff()  # automatic disables display of figures
    fig = plt.figure(figsize=(nax1 * 5, nax2 * 4))
    for i in range(nfac):
        plt.subplot(nax2, nax1, i + 1)
        ax = sns.violinplot(y=df[target_name], x=df[columns[i]])
    # Save once after all subplots are drawn (previously the file was
    # re-saved on every loop iteration), then close the figure to avoid
    # leaking open figures across repeated calls.
    plt.savefig(fname_out, dpi=300)
    plt.close(fig)
532 |
def plot_table(df_table, outpath, fname_out):
    """
    Render a dataframe as a formatted table image.

    INPUT
    df_table: dataframe to render
    outpath: output path
    fname_out: image output filename
    """
    plt.ioff()
    plt.figure(linewidth=2,
               tight_layout={'pad':40},
               figsize=(7,4)
               )
    # Shade the row and column headers with a light BuPu tone
    n_rows = len(df_table)
    n_cols = len(list(df_table))
    row_colours = plt.cm.BuPu(np.full(n_rows, 0.15))
    col_colours = plt.cm.BuPu(np.full(n_cols, 0.15))

    # Hide both axes so only the table itself is shown
    ax = plt.gca()
    for axis in (ax.get_xaxis(), ax.get_yaxis()):
        axis.set_visible(False)
    tbl = pd.plotting.table(ax, df_table, loc='center',
                            rowLoc='left',
                            colLoc='center',
                            rowColours=row_colours,
                            colColours=col_colours)
    tbl.scale(0.6, 1.3)
    tbl.set_fontsize(7)
    plt.box(on=None)
    plt.draw()
    plt.savefig(os.path.join(outpath, fname_out), dpi=300)
    plt.close()
570 |
571 |
def main(inpath, fname_results, fname_design, outpath = None):
    """
    Run the experiment-result analysis pipeline: read the results and design
    tables, compute the main statistics, and create the correlation plots.

    INPUT
    inpath: directory containing the design table and results file
    fname_results: experiment results filename (.xlsx or .csv)
    fname_design: design table filename (.csv)
    outpath: output directory; defaults to inpath if None

    RAISES
    ValueError: if fname_results has an unsupported file extension
    """
    if outpath is None:
        outpath = inpath = Path(inpath)
    else:
        outpath = Path(outpath)
    os.makedirs(outpath, exist_ok = True)
    # 1) Read in experiment result data
    # ['Nexp' 'PID', 'Y Label', 'Y Exp', 'Y Truth', 'Std Y Exp', 'Std Y Truth', 'Weight PID']
    if fname_results.endswith('.xlsx'):
        dfres = pd.read_excel(os.path.join(inpath, fname_results))
    elif fname_results.endswith('.csv'):
        dfres = pd.read_csv(os.path.join(inpath, fname_results))
    else:
        # Fail early with a clear message instead of a NameError on dfres below.
        raise ValueError("Results file must be .xlsx or .csv: " + str(fname_results))
    # 2) Read in experiment design setup table with parameter specifications
    dfdes = pd.read_csv(os.path.join(inpath, fname_design))

    # Filter out design parameters that are constant
    dfdes = dfdes[dfdes.columns[dfdes.nunique() > 1]].copy()

    # List of different predictable Y properties:
    try:
        ylabels = dfres["Y Label"].unique()
    except KeyError:
        # Narrowed from a bare except: only the missing-column case is handled.
        print("No column with name 'Y Label' found in results file. Default results name 'Y1' added.")
        dfres["Y Label"] = 'Y1'
        ylabels = dfres["Y Label"].unique()
    # First design column is the experiment number; the remaining are factors.
    params = list(dfdes)[1:]

    # Calculating main stats (RMSE, parameter importance, best parameters)
    calc_expresults_stats(ylabels, dfdes, dfres, outpath)

    # Visualise correlation results for each Y predictable
    for ylabel in ylabels:
        print("Plotting correlation plots for Ylabel:" + str(ylabel) + " ...")
        dfname = os.path.join(outpath, "Experiment_" + str(ylabel) + "_RMSE.csv")
        df = pd.read_csv(dfname)
        # Plot Pairwise X correlation for Y:
        fname_out1 = (os.path.join(
            outpath, "Y-pairwise-correlation_" + str(ylabel) + ".png")
        )
        plot_3dmap(df, params, "Y Exp Mean", fname_out1)
        # Plot Pairwise X correlation for RMSE
        fname_out2 = (os.path.join(
            outpath, "RMSE-pairwise-correlation_" + str(ylabel) + ".png")
        )
        plot_3dmap(df, params, "RMSE", fname_out2)
        # Plot Main factor correlation plot with Y:
        fname_out3 = os.path.join(outpath, "Expresult_correlation_X-Y_" + str(ylabel) + ".png")
        plot_regression(df, params, 'Y Exp Mean', fname_out3)
        # Plot factor distributions of RMSE:
        fname_out4 = os.path.join(outpath, "Expresult_distribution_X-RMSE_" + str(ylabel) + ".png")
        plot_factordis(df, params, 'RMSE', fname_out4)

    print("FINISHED")
630 |
631 |
def main_cli():
    """Command-line entry point: load a YAML settings file and run main()."""
    parser = argparse.ArgumentParser()
    # Optional positional argument; falls back to the default settings file.
    parser.add_argument('settings_path', nargs='?', default='settings_expresults.yaml')
    settings_path = parser.parse_args().settings_path
    print(f"using settings in: {settings_path!r}")
    with open(settings_path) as stream:
        settings = yaml.safe_load(stream)
    main(**settings)
640 |
641 |
if __name__ == "__main__":
    # Entry point when executed directly as a script; CLI parsing and YAML
    # settings loading are delegated to main_cli().
    #from doegen import configloader_results as cfg
    main_cli()
--------------------------------------------------------------------------------
/doegen/init_config.py:
--------------------------------------------------------------------------------
"""
Run this initialisation of .yaml and .xlsx files after installation of doegen.

Creates config yaml files and excel templates if not already existing in current working directory.
"""
import os
import sys
import shutil
import doegen

# Current working directory
cwd = os.getcwd()

# Directory where package doegen is installed:
pckd = doegen.__path__[0]


def _copy_template(fname, exists_msg, success_msg):
    """Copy template *fname* from the installed package into cwd unless it already exists."""
    if os.path.isfile(fname):
        print(exists_msg)
        return
    source = os.path.join(pckd, fname)
    target = os.path.join(cwd, fname)
    try:
        shutil.copy(source, target)
        print(success_msg)
    except IOError as e:
        print("Unable to copy file. %s" % e)
    except Exception:
        # Best-effort catch-all, but narrower than a bare 'except:' so
        # KeyboardInterrupt/SystemExit still propagate.
        print("Unexpected error:", sys.exc_info())


# Template files with their status messages: (filename, already-exists message,
# success message). Replaces four near-identical copy-paste stanzas.
_TEMPLATES = [
    ('settings_design.yaml',
     'File settings_design.yaml already exists.',
     'Please edit settings_design.yaml'),
    ('Experiment_setup.xlsx',
     'File Experiment_setup.xlsx already exist.',
     'Please add your experiment settings in Experiment_setup.xlsx'),
    ('settings_expresults.yaml',
     'File settings_expresults.yaml already exists.',
     'Please edit settings_expresults.yaml after running the experiment.'),
    ('Experiment_results.xlsx',
     'File Experiment_results.xlsx already exist.',
     'Please add your experiment results in Experiment_results.xlsx after running the experiment.'),
]

for _fname, _exists_msg, _success_msg in _TEMPLATES:
    _copy_template(_fname, _exists_msg, _success_msg)
--------------------------------------------------------------------------------
/doegen/init_tests.py:
--------------------------------------------------------------------------------
"""
Run this initialisation after installation of doegen.

Copies the packaged 'test' example directory into the current working
directory if it does not already exist.
"""
import os
import sys
import shutil
import doegen

# Current working directory
cwd = os.getcwd()

# Directory where package doegen is installed:
pckd = doegen.__path__[0]

# Copy the packaged example/test directory into the working directory.
# (Comment previously said "Create settings_design.yaml" -- stale copy-paste.)
if os.path.exists('test'):
    print("directory already exist: test")
else:
    source = os.path.join(pckd, 'test')
    target = os.path.join(cwd, 'test')
    try:
        shutil.copytree(source, target)
        print("Generated directory: test")
    except IOError as e:
        print("Unable to copy directory. %s" % e)
    except Exception:
        # Narrower than a bare 'except:': lets KeyboardInterrupt/SystemExit through.
        print("Unexpected error:", sys.exc_info())
--------------------------------------------------------------------------------
/doegen/settings_design.yaml:
--------------------------------------------------------------------------------
1 | # Settings for Experiment Design Generation
2 |
3 | # Path to exp design setup file
4 | path: 'test/'
5 | # Set path for output files. If empty (''), output folder will be same as above for setup file
6 | outpath: 'test/design_results/'
7 | # Filename for exp setup file
8 | fname_setup: 'Experiment_setup.xlsx'
9 | # Maximum number of experimental runs:
10 | nrun_max: 150
11 |
12 | # Set maximum time for optimization per run (in seconds; recommended to set to at least ~100s)
13 | maxtime_per_run: 100
14 |
15 | # Set maximal stepsize of the run size interval, so that not every run size has to be optimized
16 | # The larger the interval, the faster the total computation
17 | # by default (select delta_nrun = None) it will select automatically the interval step with the lowest common multiple of the levels
18 | # (e.g. if mix between level 2 and 3 thus will results in delta_nrun = 6)
19 | delta_nrun: None
--------------------------------------------------------------------------------
/doegen/settings_expresults.yaml:
--------------------------------------------------------------------------------
1 | # Settings for Experiment Design Generation
2 |
3 | # Path to exp design setup file
4 | inpath: 'test/design_results/'
5 | # Set path for output files. If '', output folder will be same as above for setup file
6 | outpath: 'test/evaluation_results/'
7 | # Filename for exp design table in inpath
8 | fname_design: 'Designtable_optimal_Nrun72.csv'
9 | # Filename for exp results in inpath:
10 | fname_results: 'experiment_results_Nrun72.xlsx'
--------------------------------------------------------------------------------
/doegen/test/Experiment_setup_test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/test/Experiment_setup_test.xlsx
--------------------------------------------------------------------------------
/doegen/test/results/Designtable_best_Nrun18.csv:
--------------------------------------------------------------------------------
1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8
2 | 1,0.0,-10.0,1,1.0,L1,0,0,L1
3 | 2,3.0,-3.0,3,1.0,L2,0,0,L1
4 | 3,3.0,4.0,1,3.0,L2,1,1,L1
5 | 4,3.0,-10.0,3,5.0,L2,2,0,L1
6 | 5,6.0,4.0,3,3.0,L3,0,1,L1
7 | 6,6.0,-3.0,3,5.0,L1,0,0,L2
8 | 7,3.0,-3.0,1,3.0,L1,0,0,L2
9 | 8,3.0,-3.0,5,5.0,L1,1,1,L1
10 | 9,6.0,-10.0,1,5.0,L3,1,1,L1
11 | 10,0.0,4.0,5,5.0,L2,0,1,L2
12 | 11,6.0,-3.0,1,1.0,L2,2,1,L2
13 | 12,3.0,4.0,1,5.0,L3,2,0,L2
14 | 13,6.0,-10.0,5,3.0,L2,1,0,L2
15 | 14,0.0,-3.0,5,3.0,L3,2,0,L1
16 | 15,0.0,-10.0,3,3.0,L1,2,1,L2
17 | 16,6.0,4.0,5,1.0,L1,2,0,L1
18 | 17,0.0,4.0,3,1.0,L3,1,0,L2
19 | 18,3.0,-10.0,5,1.0,L3,0,1,L2
20 |
--------------------------------------------------------------------------------
/doegen/test/results/Designtable_minimum_Nrun30.csv:
--------------------------------------------------------------------------------
1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8
2 | 1,0.0,-10.0,1,1.0,L1,0,0,L1
3 | 2,0.0,-3.0,3,1.0,L2,1,0,L1
4 | 3,3.0,-10.0,1,3.0,L2,0,0,L2
5 | 4,6.0,4.0,5,5.0,L2,2,0,L2
6 | 5,3.0,-10.0,5,5.0,L3,1,0,L1
7 | 6,0.0,-10.0,5,3.0,L1,2,0,L2
8 | 7,3.0,4.0,1,3.0,L1,1,1,L1
9 | 8,3.0,-10.0,1,3.0,L2,2,1,L1
10 | 9,3.0,-3.0,5,1.0,L1,0,0,L2
11 | 10,6.0,-10.0,1,1.0,L2,1,1,L2
12 | 11,3.0,-10.0,3,5.0,L3,1,0,L2
13 | 12,6.0,4.0,1,5.0,L3,0,0,L1
14 | 13,0.0,-3.0,3,5.0,L2,0,1,L1
15 | 14,6.0,-10.0,3,1.0,L3,0,1,L2
16 | 15,3.0,4.0,3,1.0,L1,2,0,L2
17 | 16,6.0,4.0,3,3.0,L1,1,1,L1
18 | 17,3.0,4.0,5,1.0,L2,0,1,L1
19 | 18,6.0,-10.0,5,1.0,L3,2,1,L1
20 | 19,0.0,-10.0,5,5.0,L1,1,1,L1
21 | 20,3.0,-3.0,3,3.0,L3,1,1,L2
22 | 21,3.0,-3.0,5,1.0,L3,2,1,L1
23 | 22,0.0,4.0,5,3.0,L3,0,1,L2
24 | 23,3.0,-3.0,1,5.0,L1,0,1,L2
25 | 24,6.0,-3.0,1,5.0,L1,2,1,L2
26 | 25,0.0,-10.0,3,5.0,L2,2,1,L2
27 | 26,0.0,4.0,1,1.0,L3,1,0,L2
28 | 27,6.0,-3.0,5,3.0,L2,1,0,L2
29 | 28,3.0,4.0,3,5.0,L2,2,0,L1
30 | 29,0.0,-3.0,1,3.0,L3,2,0,L1
31 | 30,6.0,-10.0,3,3.0,L1,0,0,L1
32 |
--------------------------------------------------------------------------------
/doegen/test/results/Designtable_optimal_Nrun72.csv:
--------------------------------------------------------------------------------
1 | Nexp,Factor 1,Factor 2,Factor 3,Factor 4,Factor 5,Factor 6,Factor 7,Factor 8
2 | 1,0.0,-10.0,1,1.0,L1,0,0,L1
3 | 2,3.0,-3.0,3,3.0,L1,1,1,L2
4 | 3,0.0,4.0,5,5.0,L1,2,0,L1
5 | 4,3.0,-3.0,3,3.0,L2,2,1,L1
6 | 5,6.0,-3.0,5,1.0,L3,0,1,L2
7 | 6,0.0,4.0,3,5.0,L3,1,1,L1
8 | 7,6.0,-10.0,3,3.0,L3,1,1,L1
9 | 8,0.0,-3.0,1,1.0,L1,2,1,L2
10 | 9,3.0,4.0,5,5.0,L2,1,1,L2
11 | 10,6.0,-3.0,1,3.0,L2,1,0,L1
12 | 11,3.0,-10.0,3,5.0,L1,1,0,L2
13 | 12,0.0,-10.0,3,1.0,L2,2,1,L2
14 | 13,0.0,4.0,3,5.0,L2,1,0,L1
15 | 14,3.0,-10.0,1,5.0,L2,2,1,L1
16 | 15,0.0,-10.0,3,3.0,L1,1,0,L2
17 | 16,6.0,-10.0,5,5.0,L2,0,0,L2
18 | 17,6.0,-10.0,3,5.0,L3,0,1,L2
19 | 18,6.0,4.0,1,3.0,L3,2,1,L2
20 | 19,0.0,-3.0,3,3.0,L3,2,1,L1
21 | 20,0.0,-3.0,3,5.0,L1,0,0,L1
22 | 21,3.0,-10.0,5,1.0,L3,2,0,L1
23 | 22,3.0,-10.0,5,3.0,L2,1,1,L1
24 | 23,6.0,-3.0,3,5.0,L2,0,0,L1
25 | 24,3.0,4.0,3,1.0,L3,2,0,L2
26 | 25,0.0,-10.0,1,3.0,L3,2,0,L2
27 | 26,0.0,4.0,3,5.0,L3,2,0,L2
28 | 27,6.0,4.0,1,1.0,L2,1,0,L2
29 | 28,3.0,-3.0,5,1.0,L3,2,0,L1
30 | 29,3.0,4.0,3,3.0,L1,2,0,L2
31 | 30,0.0,-3.0,5,5.0,L3,0,1,L2
32 | 31,6.0,4.0,3,5.0,L1,2,0,L1
33 | 32,6.0,-10.0,5,3.0,L1,2,1,L1
34 | 33,3.0,-3.0,3,3.0,L2,0,0,L2
35 | 34,3.0,-3.0,1,5.0,L3,0,1,L2
36 | 35,0.0,-10.0,1,5.0,L2,1,0,L2
37 | 36,0.0,-3.0,5,1.0,L2,1,0,L1
38 | 37,0.0,4.0,1,1.0,L2,2,1,L1
39 | 38,6.0,-3.0,5,1.0,L1,1,0,L2
40 | 39,0.0,-10.0,5,3.0,L3,0,1,L1
41 | 40,0.0,4.0,5,3.0,L2,0,0,L1
42 | 41,3.0,4.0,3,1.0,L3,1,1,L1
43 | 42,0.0,-10.0,1,1.0,L3,0,0,L1
44 | 43,0.0,4.0,5,3.0,L2,0,1,L1
45 | 44,3.0,-3.0,5,5.0,L3,1,0,L1
46 | 45,6.0,4.0,5,1.0,L1,1,1,L1
47 | 46,6.0,-10.0,1,3.0,L3,1,0,L1
48 | 47,6.0,4.0,1,3.0,L3,2,0,L1
49 | 48,0.0,-10.0,3,1.0,L1,1,1,L2
50 | 49,6.0,-3.0,3,3.0,L2,2,1,L2
51 | 50,6.0,-3.0,5,5.0,L2,2,1,L1
52 | 51,0.0,-3.0,1,5.0,L3,1,1,L2
53 | 52,6.0,4.0,3,1.0,L2,0,1,L2
54 | 53,6.0,-10.0,3,1.0,L1,0,1,L2
55 | 54,6.0,-10.0,1,5.0,L1,1,1,L1
56 | 55,3.0,-10.0,5,5.0,L3,0,0,L2
57 | 56,0.0,-3.0,5,3.0,L1,2,0,L2
58 | 57,3.0,-3.0,1,3.0,L1,0,1,L1
59 | 58,3.0,-10.0,3,1.0,L2,0,0,L1
60 | 59,6.0,4.0,5,3.0,L3,1,0,L2
61 | 60,3.0,4.0,5,1.0,L3,1,1,L2
62 | 61,3.0,-3.0,1,1.0,L1,1,0,L1
63 | 62,3.0,-10.0,5,5.0,L1,2,1,L1
64 | 63,6.0,-3.0,3,1.0,L3,0,0,L1
65 | 64,0.0,4.0,5,3.0,L1,0,1,L2
66 | 65,6.0,4.0,1,5.0,L1,0,1,L1
67 | 66,0.0,-3.0,1,1.0,L2,1,1,L2
68 | 67,3.0,4.0,1,3.0,L1,0,0,L2
69 | 68,3.0,4.0,1,3.0,L2,0,0,L2
70 | 69,6.0,-10.0,5,1.0,L2,2,0,L2
71 | 70,3.0,-10.0,1,5.0,L2,2,1,L2
72 | 71,6.0,-3.0,1,5.0,L1,2,0,L2
73 | 72,3.0,4.0,1,1.0,L1,0,1,L1
74 |
--------------------------------------------------------------------------------
/doegen/test/results/experiment_results_Nrun72.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/doegen/test/results/experiment_results_Nrun72.xlsx
--------------------------------------------------------------------------------
/doegen/test/settings_design_test.yaml:
--------------------------------------------------------------------------------
1 | # Settings for Experiment Design Generation
2 |
3 | # Path to exp design setup file
4 | path: 'test/'
5 | # Set path for output files. If empty (''), output folder will be same as above for setup file
6 | outpath: 'test/results_test/'
7 | # Filename for exp setup file
8 | fname_setup: 'Experiment_setup_test.xlsx'
9 | # Maximum number of experimental runs:
10 | nrun_max: 80
11 |
12 | # Set maximum time for optimization per run (in seconds, recommended to set to at least ~100s)
13 | maxtime_per_run: 80
14 |
15 | # Set maximal stepsize of the run size interval, so that not every run size has to be optimized
16 | # The larger the interval, the faster the total computation
17 | # by default (select delta_nrun = None) it will select automatically the interval step with the lowest common multiple of the levels
18 | # (e.g. if mix between level 2 and 3 thus will results in delta_nrun = 6)
19 | delta_nrun: None
--------------------------------------------------------------------------------
/doegen/test/settings_expresults_test.yaml:
--------------------------------------------------------------------------------
1 | # Settings for Experiment Design Generation
2 |
3 | # Path to exp design setup file
4 | inpath: 'test/results/'
5 | # Set path for output files. If '', output folder will be same as above for setup file
6 | outpath: 'test/expresults2/'
7 | # Filename for exp design table in inpath
8 | fname_design: 'Designtable_optimal_Nrun72.csv'
9 | # Filename for exp results in inpath:
10 | fname_results: 'experiment_results_Nrun72.xlsx'
--------------------------------------------------------------------------------
/figures/BestFactor_Avg1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/BestFactor_Avg1.png
--------------------------------------------------------------------------------
/figures/Designtable_optimal_Nrun72.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Designtable_optimal_Nrun72.png
--------------------------------------------------------------------------------
/figures/Efficiencies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Efficiencies.png
--------------------------------------------------------------------------------
/figures/Efficiencies_[3, 3, 3, 3, 3, 3, 2, 2].png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Efficiencies_[3, 3, 3, 3, 3, 3, 2, 2].png
--------------------------------------------------------------------------------
/figures/Experiment_result_Nrun72_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Experiment_result_Nrun72_header.png
--------------------------------------------------------------------------------
/figures/Expresult_correlation_X_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Expresult_correlation_X_1.png
--------------------------------------------------------------------------------
/figures/Expresult_pairwise-correlation_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Expresult_pairwise-correlation_1.png
--------------------------------------------------------------------------------
/figures/Result_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Result_header.png
--------------------------------------------------------------------------------
/figures/Results_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Results_overview.png
--------------------------------------------------------------------------------
/figures/Setup_header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Setup_header.png
--------------------------------------------------------------------------------
/figures/Setup_header_test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Setup_header_test.png
--------------------------------------------------------------------------------
/figures/Top10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Top10.png
--------------------------------------------------------------------------------
/figures/Ybarplot_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/Ybarplot_1.png
--------------------------------------------------------------------------------
/figures/pairwise_correlation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sebhaan/DoEgen/3acce346928ee68917484a13bfa57b805299a9d5/figures/pairwise_correlation.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | xlrd==1.2.0
2 | matplotlib==3.3.4
3 | pandas>=1.2.3
4 | XlsxWriter==1.2.8
5 | OApackage>=2.7.7
6 | numpy>=1.22.0
7 | seaborn==0.11.1
8 | PyYAML==5.4.1
9 | scikit_learn==0.24.1
10 | openpyxl==3.0.7
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | try:
2 | from setuptools import setup, find_packages
3 | except ImportError:
4 | from distutils.core import setup
5 | from os import path
6 | import os
7 | import subprocess
8 | import io
9 |
10 | ## in development set version
11 | PYPI_VERSION = '0.5.0'
12 |
# Return the git revision as a string (from numpy)
def git_version():
    """Return the short git revision of the working tree, or "Unknown"."""
    def _minimal_ext_cmd(cmd):
        # Build a minimal environment so git output is not localised.
        env = {k: os.environ[k] for k in ('SYSTEMROOT', 'PATH') if os.environ.get(k) is not None}
        # LANGUAGE is used on win32
        env['LANGUAGE'] = 'C'
        env['LANG'] = 'C'
        env['LC_ALL'] = 'C'
        return subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env).communicate()[0]

    try:
        revision = _minimal_ext_cmd(['git', 'rev-parse', '--short', 'HEAD']).strip().decode('ascii')
    except OSError:
        revision = "Unknown"
    return revision
36 |
37 |
# Fall back to the current git revision when no explicit release version is set.
if PYPI_VERSION is None:
    PYPI_VERSION = git_version()


# Use the repository README as the PyPI long description.
this_directory = path.abspath(path.dirname(__file__))
with io.open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

# NOTE(review): this result is unused below -- setup() hardcodes packages=['doegen'].
packages = find_packages()
47 |
if __name__ == "__main__":
    # Package metadata and dependency pins for the PyPI distribution.
    setup(name = 'DoEgen',
          author = "Sebastian Haan",
          author_email = "sebastian.haan@sydney.edu.au",
          url = "https://github.com/sebhaan/DoEgen",
          version = PYPI_VERSION,
          description = "DoEgen: A Python Library for Optimised Design of Experiment Generation and Evaluation",
          long_description = long_description,
          long_description_content_type='text/markdown',
          install_requires = ['numpy>=1.16.3',
                              'xlrd==1.2.0',
                              'pandas>=1.0.3',
                              'XlsxWriter>=1.2.8',
                              'openpyxl>=3.0.7',
                              'seaborn>=0.11.1',
                              'OApackage>=2.7.11',
                              'tabulate>=0.8.9',
                              'matplotlib>=3.1.0',
                              'PyYAML>=5.3.1',
                              'scikit_learn>=0.22.2.post1'],
          python_requires = '>=3.6',
          setup_requires = ["pytest-runner", 'webdav'],
          tests_require = ["pytest", 'webdav'],
          # NOTE(review): hardcoded package list; the find_packages() result
          # computed above is not used here.
          packages = ['doegen'],
          # Template and test fixture files shipped inside the package.
          package_data = {'doegen': ['*.yaml',
                                     '*.xlsx',
                                     'test/Experiment_setup_test.xlsx',
                                     'test/settings_design_test.yaml',
                                     'test/settings_expresults_test.yaml',
                                     'test/results/experiment_results_Nrun72.xlsx',
                                     'test/results/Designtable_optimal_Nrun72.csv']},
          include_package_data = False,
          classifiers = ['Programming Language :: Python :: 3',
                         'Programming Language :: Python :: 3.6',
                         'Programming Language :: Python :: 3.7'
                         ]
          )
--------------------------------------------------------------------------------