├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── MCTS_node.py
├── README.md
├── Rollout_policies.py
├── Tree.py
├── UCT_policies.py
├── biological_scoring.py
├── calculate_organisms.py
├── calculate_rule_sets_similarity.py
├── change_config.py
├── chemical_compounds_state.py
├── chemical_scoring.py
├── chemistry_choices.md
├── compound.py
├── compound_scoring.py
├── config.py
├── convert_to_SBML.py
├── data
│   ├── base_config.py
│   ├── compounds_to_add
│   │   └── TPA_to_add.csv
│   ├── golden_dataset.csv
│   ├── name_structure_toxicity.csv
│   ├── sinks
│   │   ├── bsubtilis_iYO844_sink_reduced_rp_ready.csv
│   │   ├── detectable_metabolites_uncommented.csv
│   │   ├── ecoli_core_sink_reduced_rp_ready.csv
│   │   ├── ecoli_iJO1366_sink_reduced_rp_ready.csv
│   │   └── ecoli_iML1515_sink_reduced_rp_ready.csv
│   └── supplement_finder
│       ├── data
│       │   └── metanetx_extracted_inchikeys.json.tar.gz
│       └── tree_for_testing
│           ├── TPA
│           │   └── pickles
│           │       └── tree_end_search.pkl.tar.gz
│           └── morphine
│               └── pickles
│                   └── tree_end_search.pkl.tar.gz
├── document_all_options.md
├── expected_results
│   ├── deoxiviolacein_1.json
│   ├── deoxiviolacein_2.json
│   ├── deoxiviolacein_3.json
│   ├── deoxiviolacein_4.json
│   ├── deoxiviolacein_best.json
│   ├── deoxiviolacein_full_scope.json
│   ├── deoxiviolacein_full_tree_for_MCTS.json
│   ├── deoxiviolacein_iteration_12.json
│   ├── deoxiviolacein_iteration_15.json
│   ├── deoxiviolacein_iteration_82.json
│   ├── deoxiviolacein_iteration_85.json
│   ├── pickles
│   │   └── tree_end_search.pkl.tar.gz
│   ├── results.csv
│   └── tree.log
├── move.py
├── organisms.py
├── pathway.py
├── pathway_scoring.py
├── pyproject.toml
├── representation.py
├── rewarding.py
├── rule_sets_examples.py
├── rule_sets_similarity.py
├── setup.py
├── supplement_finder.py
├── tests
│   ├── data
│   │   ├── rules_mixed_subset.tsv
│   │   ├── rules_r10_subset.tsv
│   │   ├── rules_r2_subset.tsv
│   │   ├── state_BOPG_BSAB_GPRL.pkl
│   │   └── tree_pipecolate_test.pkl
│   ├── generated_jsons
│   │   ├── .gitignore
│   │   └── .gitkeep
│   ├── test_Filters.py
│   ├── test_MCTS_node.py
│   ├── test_Standardizer.py
│   ├── test_Tree.py
│   ├── test_Utils.py
│   ├── test_cli.py
│   ├── test_compound.py
│   ├── test_moves.py
│   ├── test_state.py
│   └── tree_test.pkl
├── tox.ini
├── tree_viewer.py
└── utilities
    ├── chemtools
    │   ├── Filters.py
    │   ├── Sequences.py
    │   ├── Standardizer.py
    │   └── Utils.py
    └── reactor
        ├── Core.py
        ├── Utils.py
        └── cli.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Usual stuff
2 | .DS_Store
3 | __pycache__
4 | *.egg-info
5 |
6 | # Data
7 | data/*/*.log
8 | data/*/*.pkl
9 |
10 | # Test data
11 | tests/generated_jsons/pipecolate_iteration_0.json
12 |
13 | # IDE
14 | .vscode
15 | .idea
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## Unreleased
2 |
3 | ### Feat
4 |
5 | - enables execution without fire timeout
6 | - **Tree**: refine debug logging
7 |
8 | ### Fix
9 |
10 | - **compound**: apply standardisation timeout to new compounds
11 | - **Tree**: timeout arguments as int
12 | - further restrict rdkit version (reproducibility issue #21)
13 |
14 | ### Refactor
15 |
16 | - **Tree**: remove unused code
17 |
18 | ## 1.0.1 (2024-06-20)
19 |
20 | ### Fix
21 |
22 | - **DATA_PATH**: fix typo
23 | - **Tree**: import missing pre-parsed organisms
24 |
25 | ### Refactor
26 |
27 | - **Tree**: sweep imports
28 | - **calculate_organisms**: clean organism data files generation
29 | - **calculate_organisms**: remove unused imports
30 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Mathilde Koch, INRA
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include chemistry_choices.md
2 | include data/compounds_to_add/*
3 | include data/sinks/*
4 | include data/supplement_finder/*
5 | include data/golden_dataset.csv
6 | include data/name_structure_toxicity.csv
7 | include expected_results/*
8 | include tests/*
9 | include utilities/*
10 | include README.md
11 |
--------------------------------------------------------------------------------
/Rollout_policies.py:
--------------------------------------------------------------------------------
1 |
2 | """
3 | Defines the Rollout policies.
4 | Usage: move = rollout_policy.select_best_move(available_moves)
5 | Remarks:
6 | - various policies have been tested on toy examples in a Jupyter notebook during implementation
7 | """
8 |
9 | from math import sqrt, log
10 | import random
11 |
12 | class Rollout_policy(object):
13 | """
14 | Defines rollout policy.
15 | From a list of moves, select the one that should be used for rollout.
16 |     This is the base class; subclasses must provide a policy function.
17 | """
18 | def __init__(self, policy_type, description = "Default Rollout Policy"):
19 | self.policy_type = policy_type
20 | self.description = description
21 |
22 | def select_best_move(self, available_moves):
23 | try:
24 | move = self.policy(available_moves)
25 | return(move)
26 | except IndexError:
27 | return(None)
28 |
29 | def __str__(self):
30 | return("Policy type: {} \nDescription: {}".format(self.policy_type, self.description))
31 |
32 | class Rollout_policy_first(Rollout_policy):
33 | """
34 | Defines rollout policy.
35 | Always returns the first element: first compound, first rule
36 | """
37 | def __init__(self):
38 | description = "Always select the first compound_rule combination"
39 | Rollout_policy.__init__(self, policy_type = "First found combination", description = description)
40 | self.name = "Rollout_policy_first"
41 | self.policy = self.policy()
42 |
43 | def policy(self):
44 |         # Returns a closure that always picks the first available move
45 | def select_best_inside(available_moves):
46 | move = available_moves[0]
47 | return(move)
48 | return(select_best_inside)
49 |
50 | class Rollout_policy_chemical_best(Rollout_policy):
51 | """
52 | Defines rollout policy.
53 | Always returns the best chemical move
54 | """
55 | def __init__(self):
56 | description = "Always select the move with the highest chemical score"
57 | Rollout_policy.__init__(self, policy_type = "Best Chemical", description = description)
58 | self.policy = self.best_chemical_policy()
59 | self.name = "Rollout_policy_chemical_best"
60 |
61 | def best_chemical_policy(self):
62 |         # Returns a closure that picks the move with the highest chemical score
63 | def select_best_inside(available_moves):
64 | current_best = available_moves[0]
65 | current_best_score = current_best.chemical_score
66 | for element in available_moves:
67 | chemical_score = element.chemical_score
68 | if chemical_score > current_best_score:
69 | current_best_score = chemical_score
70 | current_best = element
71 | return(current_best)
72 | return(select_best_inside)
73 |
74 | class Rollout_policy_biological_best(Rollout_policy):
75 | """
76 | Defines rollout policy.
77 | Always returns the best biological move
78 | """
79 | def __init__(self):
80 | description = "Always select the move with the highest biological score"
81 | Rollout_policy.__init__(self, policy_type = "Best Biological", description = description)
82 | self.policy = self.best_biological_policy()
83 | self.name = "Rollout_policy_biological_best"
84 |
85 | def best_biological_policy(self):
86 |         # Returns a closure that picks the move with the highest biological score
87 | def select_best_inside(available_moves):
88 | current_best = available_moves[0]
89 | current_best_score = current_best.biological_score
90 | for element in available_moves:
91 |                 biological_score = element.biological_score
92 | if biological_score > current_best_score:
93 | current_best_score = biological_score
94 | current_best = element
95 | return(current_best)
96 | return(select_best_inside)
97 |
98 | class Rollout_policy_biochemical_addition_best(Rollout_policy):
99 | """
100 | Defines rollout policy.
101 | Always returns the best biochemical (addition of scores) move
102 | """
103 | def __init__(self):
104 | description = "Select the highest Biochemical addition score"
105 | Rollout_policy.__init__(self, policy_type = "Best Biochemical addition", description = description)
106 | self.policy = self.best_biochemical_policy()
107 | self.name = "Rollout_policy_biochemical_addition_best"
108 |
109 | def best_biochemical_policy(self):
110 |         # Returns a closure that picks the move with the highest biological + chemical score
111 | def select_best_inside(available_moves):
112 | current_best = available_moves[0]
113 | current_best_score = current_best.biological_score + current_best.chemical_score
114 | for element in available_moves:
115 | biological_score = element.biological_score
116 | chemical_score = element.chemical_score
117 | if biological_score + chemical_score > current_best_score:
118 | current_best_score = biological_score + chemical_score
119 | current_best = element
120 | return(current_best)
121 | return(select_best_inside)
122 |
123 | class Rollout_policy_biochemical_multiplication_best(Rollout_policy):
124 | """
125 | Defines rollout policy.
126 | Always returns the best biochemical (multiplication of scores) move
127 | """
128 | def __init__(self):
129 | description = "Select the highest Biochemical multiplication score"
130 | Rollout_policy.__init__(self, policy_type = "Best Biochemical multiplication", description = description)
131 | self.policy = self.best_biochemical_policy()
132 | self.name = "Rollout_policy_biochemical_multiplication_best"
133 |
134 | def best_biochemical_policy(self):
135 |         # Returns a closure that picks the move with the highest biological * chemical score
136 | def select_best_inside(available_moves):
137 | current_best = available_moves[0]
138 | current_best_score = current_best.biological_score * current_best.chemical_score
139 | for element in available_moves:
140 | biological_score = element.biological_score
141 | chemical_score = element.chemical_score
142 | if biological_score * chemical_score > current_best_score:
143 | current_best_score = biological_score * chemical_score
144 | current_best = element
145 | return(current_best)
146 | return(select_best_inside)
147 |
148 | class Rollout_policy_random_uniform(Rollout_policy):
149 | """
150 | Random sampling of the move amongst available moves
151 | """
152 | def __init__(self):
153 | description = "Random selection - no scoring involved"
154 | Rollout_policy.__init__(self, policy_type = "Random sampling", description = description)
155 | self.policy = self.policy()
156 | self.name = "Rollout_policy_random_uniform"
157 |
158 | def policy(self):
159 |         # Returns a closure that picks a move uniformly at random
160 | def select_best_inside(available_moves):
161 | index = random.randrange(0, len(available_moves))
162 | move = available_moves[index]
163 | return(move)
164 | return(select_best_inside)
165 |
166 | class Rollout_policy_random_uniform_on_chem_score(Rollout_policy):
167 | """
168 | Random sampling of the move amongst available moves, weighted by chemical score
169 | """
170 | def __init__(self):
171 | description = "Random selection - uniform sampling from chemical weights"
172 | Rollout_policy.__init__(self, policy_type = "Chemical uniform sampling", description = description)
173 | self.policy = self.policy()
174 | self.name = "Rollout_policy_random_uniform_on_chem_score"
175 |
176 | def policy(self):
177 |         # Returns a closure that samples a move weighted by its chemical score
178 | def select_best_inside(available_moves):
179 | pop, cum, cum_w = [], [], 0
180 | for move in available_moves:
181 | pop.append(move)
182 | cum_w = cum_w + move.chemical_score
183 | cum.append(cum_w)
184 | move = random.choices(pop, cum_weights=cum, k=1)[0]
185 | return(move)
186 | return(select_best_inside)
187 |
188 | class Rollout_policy_random_uniform_on_bio_score(Rollout_policy):
189 | """
190 | Random sampling of the move amongst available moves, weighted by biological score
191 | """
192 | def __init__(self):
193 | description = "Random selection - uniform sampling from biological weights"
194 | Rollout_policy.__init__(self, policy_type = "Biological uniform sampling", description = description)
195 | self.policy = self.policy()
196 | self.name = "Rollout_policy_random_uniform_on_bio_score"
197 | def policy(self):
198 |         # Returns a closure that samples a move weighted by its biological score
199 | def select_best_inside(available_moves):
200 | pop, cum, cum_w = [], [], 0
201 |
202 | for move in available_moves:
203 | pop.append(move)
204 | cum_w = cum_w + move.biological_score
205 | cum.append(cum_w)
206 | move = random.choices(pop, cum_weights=cum, k=1)[0]
207 | return(move)
208 | return(select_best_inside)
209 |
210 | class Rollout_policy_random_uniform_on_biochemical_addition_score(Rollout_policy):
211 | """
212 | Random sampling of the move amongst available moves, weighted by biochemical (addition) score
213 | """
214 | def __init__(self):
215 | description = "Random selection - uniform sampling from added biochemical weights"
216 | Rollout_policy.__init__(self, policy_type = "Biochemical addition uniform sampling", description = description)
217 | self.policy = self.policy()
218 | self.name = "Rollout_policy_random_uniform_on_biochemical_addition_score"
219 |
220 | def policy(self):
221 |         # Returns a closure that samples a move weighted by its biological + chemical score
222 | def select_best_inside(available_moves):
223 | pop, cum, cum_w = [], [], 0
224 |
225 | for move in available_moves:
226 | pop.append(move)
227 | cum_w = cum_w + move.biological_score + move.chemical_score
228 | cum.append(cum_w)
229 | move = random.choices(pop, cum_weights=cum, k=1)[0]
230 | return(move)
231 | return(select_best_inside)
232 |
233 | class Rollout_policy_random_uniform_on_biochemical_multiplication_score(Rollout_policy):
234 | """
235 | Random sampling of the move amongst available moves, weighted by biochemical (multiplication) score
236 | """
237 | def __init__(self):
238 | description = "Random selection - uniform sampling from multiplied biochemical weights"
239 |         Rollout_policy.__init__(self, policy_type = "Biochemical multiplication uniform sampling", description = description)
240 | self.policy = self.policy()
241 | self.name = "Rollout_policy_random_uniform_on_biochemical_multiplication_score"
242 |
243 | def policy(self):
244 |         # Returns a closure that samples a move weighted by its biological * chemical score
245 | def select_best_inside(available_moves):
246 | pop, cum, cum_w = [], [], 0
247 |
248 | for move in available_moves:
249 | pop.append(move)
250 | cum_w = cum_w + move.biological_score * move.chemical_score
251 | cum.append(cum_w)
252 | move = random.choices(pop, cum_weights=cum, k=1)[0]
253 | return(move)
254 | return(select_best_inside)
255 |
--------------------------------------------------------------------------------
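
A minimal usage sketch for the rollout policies defined above (not part of the repository). FakeMove is a hypothetical stand-in for the project's Move objects, mocking only the chemical_score and biological_score attributes that the policies read.

from collections import namedtuple

from Rollout_policies import (
    Rollout_policy_chemical_best,
    Rollout_policy_random_uniform_on_bio_score,
)

# Stand-in for the project's Move objects: only the scores read by the policies.
FakeMove = namedtuple("FakeMove", ["name", "chemical_score", "biological_score"])

moves = [
    FakeMove("rule_A", chemical_score=0.30, biological_score=0.90),
    FakeMove("rule_B", chemical_score=0.85, biological_score=0.40),
]

best_chem = Rollout_policy_chemical_best()
print(best_chem.select_best_move(moves).name)    # rule_B (highest chemical score)

bio_sampler = Rollout_policy_random_uniform_on_bio_score()
print(bio_sampler.select_best_move(moves).name)  # rule_A or rule_B, weighted by biological score
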
/UCT_policies.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines the UCT (Upper Confidence Tree) policies.
3 | It is the formula that balances exploration and exploitation when selecting children in the Tree.
4 | Implements a number of different policies.
5 | Policies are subclasses of the UCT_policy class.
6 | They need to have a function attribute that does the calculation. See the examples if you want to develop your own.
7 | """
8 |
9 | from math import sqrt, log
10 |
11 |
12 | class UCT_policy(object):
13 | """
14 | Defines UCT_policies objects.
15 | They take a node and return the best child according to this policy.
16 | Only subclasses of this object can work as there is no default calculation function.
17 | """
18 | def __init__(self, parameters = {"UCTK": 2}, policy_type = 'Classical', function = None):
19 | self.parameters = parameters
20 | self.policy_type = policy_type
21 |
22 | def calculate(self, node, top_n = 1):
23 | s = sorted(node.children, key = lambda c: self.function(c, parent_visits = node.visits))
24 | s = s[-top_n]
25 | return s
26 |
27 | def __str__(self):
28 | return("Policy type: {} \nFormula: {}".format(self.policy_type, self.formula))
29 |
30 |
31 | class Classical_UCT(UCT_policy):
32 | """
33 |     This class implements the most basic UCT function.
34 |     It only uses the number of visits as a criterion.
35 |     It is the classical UCT formula, where no additional expert knowledge is injected.
36 | """
37 | def __init__(self, parameters = {"UCTK": 1000}):
38 | UCT_policy.__init__(self, policy_type = "Classical")
39 | self.parameters = parameters
40 | self.formula = "mean_score + sqrt({}*log(N + 1)/(n+1))".format(parameters["UCTK"])
41 | self.function = self.simple_UCT_formula(self.parameters)
42 |
43 | def simple_UCT_formula(self, parameters):
44 | UCTK = parameters["UCTK"]
45 | def simple_formula_inside(c, parent_visits):
46 | value = c.average_score + sqrt(UCTK*log(parent_visits +1)/(c.visits + 1))
47 | return(value)
48 | return(simple_formula_inside)
49 |
50 | class Classical_UCT_RAVE(UCT_policy):
51 | """
52 | This class implements UCT based on visit count and RAVE.
53 | RAVE stands for Rapid Action Value Estimation:
54 |     - it adds another score based on the usage of identical moves elsewhere in the Tree
55 |     - this is weighted by the number of visits: as visits increase, the actual score of the node becomes more important than this initial estimate.
56 | """
57 | def __init__(self, parameters = {"UCTK": 1000, "k_rave": 100}):
58 | UCT_policy.__init__(self, policy_type = "Classical_RAVE")
59 | self.parameters = parameters
60 | self.formula = "(1-b) mean_score + b rave_score + sqrt({}*log(N + 1)/(n+1)) with b = sqrt({}/(3N + {}))".format(parameters["UCTK"], parameters["k_rave"], parameters["k_rave"])
61 | self.function = self.RAVE_formula(parameters = self.parameters)
62 |
63 | def RAVE_formula(self, parameters):
64 | UCTK = parameters["UCTK"]
65 | k_rave = parameters["k_rave"]
66 | def simple_formula_inside(c, parent_visits):
67 | b = sqrt(k_rave/(3*parent_visits + k_rave))
68 | value = c.average_score *(1-b) + b * c.move.RAVE_average_score + sqrt(UCTK*log(parent_visits +1)/(c.visits + 1))
69 | return(value)
70 | return(simple_formula_inside)
71 |
72 | class Classical_UCT_with_bias(UCT_policy):
73 | """
74 | This class implements UCT based on visits and progressive bias.
75 | Progressive bias works by
76 |     - giving an initial value to a node (based on expert knowledge, for example)
77 |     - this initial estimate loses importance as the node gets visited, in favor of actual rollouts.
78 | """
79 | def __init__(self, parameters = {"UCTK": 1000, "bias_k": 1}):
80 | UCT_policy.__init__(self, policy_type = "Classical")
81 | self.parameters = parameters
82 | self.formula = "mean_score + sqrt({}*log(N + 1)/(n+1)) + {} * progressive_bias/(n+1)".format(parameters["UCTK"], parameters["bias_k"])
83 | self.function = self.simple_UCT_formula(self.parameters)
84 |
85 | def simple_UCT_formula(self, parameters):
86 | UCTK = parameters["UCTK"]
87 | bias_k = parameters["bias_k"]
88 | def simple_formula_inside(c, parent_visits):
89 | value = c.average_score + sqrt(UCTK*log(parent_visits +1)/(c.visits + 1))+ bias_k * c.progressive_bias/(c.visits + 1)
90 | return(value)
91 | return(simple_formula_inside)
92 |
93 | class Nature_UCT(UCT_policy):
94 | """
95 |     This class implements the formula used in the following Nature paper (https://doi.org/10.1038/nature25978):
96 |     Planning chemical syntheses with deep neural networks and symbolic AI.
97 |     It is identical to the Chemical Scoring UCT (Chemical_UCT_1).
98 | """
99 | def __init__(self, parameters = {"UCTK": 3}):
100 |         UCT_policy.__init__(self, policy_type = "Nature Symbolic AI")
101 | self.parameters = parameters
102 | self.formula = "mean_score + {} * P * sqrt(N/(n+1))".format(parameters["UCTK"])
103 | self.function = self.Nature_UCT_formula(self.parameters)
104 |
105 | def Nature_UCT_formula(self, parameters):
106 | UCTK = parameters["UCTK"]
107 | def simple_formula_inside(c, parent_visits):
108 | chem_P = c.move.chemical_score
109 | value = c.average_score + UCTK * chem_P *sqrt(parent_visits/(c.visits + 1))
110 | return(value)
111 | return(simple_formula_inside)
112 |
113 | class Biochemical_UCT_1(UCT_policy):
114 | """
115 | This class implements a simple biochemical score UCT.
116 | The selection is guided by a product of chemical and biological score.
117 | """
118 | def __init__(self, parameters = {"UCTK": 3}):
119 | UCT_policy.__init__(self, policy_type = "Biochemical multiplication")
120 | self.parameters = parameters
121 | self.formula = "mean_score + {} * P_c * B * sqrt(N/(n+1))".format(parameters["UCTK"])
122 | self.function = self.Biochemical_UCT_formula(self.parameters)
123 |
124 | def Biochemical_UCT_formula(self, parameters):
125 | UCTK = parameters["UCTK"]
126 | def simple_formula_inside(c, parent_visits):
127 | chem_P = c.move.chemical_score
128 | b_score = c.move.biological_score
129 | value = c.average_score + UCTK * chem_P * b_score *sqrt(parent_visits/(c.visits + 1))
130 | return(value)
131 | return(simple_formula_inside)
132 |
133 | class Biological_UCT_1(UCT_policy):
134 | """
135 | This class implements a simple biological score UCT.
136 | The selection is guided by Biological score only.
137 | """
138 | def __init__(self, parameters = {"UCTK": 3}):
139 | UCT_policy.__init__(self, policy_type = "Biological score only")
140 | self.parameters = parameters
141 | self.formula = "mean_score + {} * B * sqrt(N/(n+1))".format(parameters["UCTK"])
142 | self.function = self.Biological_UCT_formula(self.parameters)
143 |
144 | def Biological_UCT_formula(self, parameters):
145 | UCTK = parameters["UCTK"]
146 | def simple_formula_inside(c, parent_visits):
147 | b_score = c.move.biological_score
148 | value = c.average_score + UCTK * b_score *sqrt(parent_visits/(c.visits + 1))
149 | return(value)
150 | return(simple_formula_inside)
151 |
152 | class Chemical_UCT_1(UCT_policy):
153 | """
154 | This class implements a simple chemical score UCT.
155 | The selection is guided by Chemical score only.
156 | """
157 | def __init__(self, parameters = {"UCTK": 3}):
158 | UCT_policy.__init__(self, policy_type = "Chemical multiplication")
159 | self.parameters = parameters
160 | self.formula = "mean_score + {} * P_c * sqrt(N/(n+1))".format(parameters["UCTK"])
161 | self.function = self.Chemical_UCT_formula(self.parameters)
162 |
163 | # @staticmethod
164 | def Chemical_UCT_formula(self, parameters):
165 | UCTK = parameters["UCTK"]
166 | def simple_formula_inside(c, parent_visits):
167 | chem_P = c.move.chemical_score
168 | value = c.average_score + UCTK * chem_P *sqrt(parent_visits/(c.visits + 1))
169 | return(value)
170 | return(simple_formula_inside)
171 |
172 | class Biochemical_UCT_1_with_RAVE(UCT_policy):
173 | """
174 | This class implements a biochemical score UCT with RAVE augmentation.
175 | RAVE stands for Rapid Action Value Estimation:
176 |     - it adds another score based on the usage of identical moves elsewhere in the Tree
177 |     - this is weighted by the number of visits: as visits increase, the actual score of the node becomes more important than this initial estimate.
178 | """
179 | def __init__(self, parameters = {"UCTK": 3, "k_rave": 100}):
180 | UCT_policy.__init__(self, policy_type = "Biochemical multiplication with RAVE")
181 | self.parameters = parameters
182 |         self.formula = "(1-b) mean_score + b rave_score + {} * P_c * B * sqrt(N/(n+1)) with b = sqrt({}/(3N + {}))".format(parameters["UCTK"], parameters["k_rave"], parameters["k_rave"])
183 | self.function = self.Biochemical_UCT_RAVE_formula(self.parameters)
184 |
185 | def Biochemical_UCT_RAVE_formula(self, parameters):
186 | UCTK = parameters["UCTK"]
187 | k_rave = parameters["k_rave"]
188 | def simple_formula_inside(c, parent_visits):
189 | b = sqrt(k_rave/(3*parent_visits + k_rave))
190 | b_score = c.move.biological_score
191 | chem_P = c.move.chemical_score
192 | value = c.average_score * (1-b) + b * c.move.RAVE_average_score + UCTK * chem_P * b_score *sqrt(parent_visits/(c.visits + 1))
193 | return(value)
194 | return(simple_formula_inside)
195 |
196 | class Biochemical_UCT_with_progressive_bias(UCT_policy):
197 | """
198 | This class implements a biochemical score UCT and progressive bias.
199 | Progressive bias works by
200 |     - giving an initial value to a node (based on expert knowledge, for example)
201 |     - this initial estimate loses importance as the node gets visited, in favor of actual rollouts.
202 | """
203 | def __init__(self, parameters = {"UCTK": 3, "bias_k": 1}):
204 | UCT_policy.__init__(self, policy_type = "Biochemical with progressive bias")
205 | self.parameters = parameters
206 | self.formula = "mean_score + {} * bias/(n+1) + {} * P_c * B * sqrt(N/(n+1))".format(parameters["bias_k"], parameters["UCTK"])
207 | self.function = self.Biochemical_UCT_with_bias_formula(parameters)
208 |
209 | # @staticmethod
210 | def Biochemical_UCT_with_bias_formula(self, parameters):
211 | UCTK = parameters["UCTK"]
212 | bias_k = parameters["bias_k"]
213 | def simple_formula_inside(c, parent_visits):
214 | chem_P = c.move.chemical_score
215 | b_score = c.move.biological_score
216 | bias = c.progressive_bias
217 | value = c.average_score + bias_k * bias/(c.visits +1) + UCTK * chem_P * b_score *sqrt(parent_visits/(c.visits + 1))
218 | return(value)
219 | return(simple_formula_inside)
220 |
221 | class Biochemical_UCT_with_toxicity(UCT_policy):
222 | """
223 | This class implements a biochemical score UCT combined with toxicity bias.
224 |     The formula is identical to Biochemical_UCT_with_progressive_bias, the bias being the node's toxicity.
225 | """
226 | def __init__(self, parameters = {"UCTK": 3, "bias_k": 1}):
227 | UCT_policy.__init__(self, policy_type = "Biochemical with toxicity")
228 | self.parameters = parameters
229 | self.formula = "mean_score + {} * toxicity/(n+1) + {} * P_c * B * sqrt(N/(n+1))".format(parameters["bias_k"], parameters["UCTK"])
230 | self.function = self.Biochemical_UCT_with_toxicity_formula(parameters)
231 |
232 | def Biochemical_UCT_with_toxicity_formula(self, parameters):
233 | UCTK = parameters["UCTK"]
234 | bias_k = parameters["bias_k"]
235 | def simple_formula_inside(c, parent_visits):
236 | chem_P = c.move.chemical_score
237 | b_score = c.move.biological_score
238 | toxicity = c.toxicity
239 | value = c.average_score + bias_k * toxicity/(c.visits +1) + UCTK * chem_P * b_score *sqrt(parent_visits/(c.visits + 1))
240 | return(value)
241 | return(simple_formula_inside)
242 |
--------------------------------------------------------------------------------
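
A minimal child-selection sketch for the UCT policies above (not part of the repository). The SimpleNamespace objects are hypothetical stand-ins for MCTS node instances, mocking only the visits, average_score, children and move.chemical_score attributes that the formulas read.

from types import SimpleNamespace

from UCT_policies import Classical_UCT, Chemical_UCT_1


def child(name, visits, average_score, chemical_score):
    # Stand-in for an MCTS node: only the attributes used by the UCT formulas.
    return SimpleNamespace(
        name=name,
        visits=visits,
        average_score=average_score,
        move=SimpleNamespace(chemical_score=chemical_score),
    )


parent = SimpleNamespace(
    visits=50,
    children=[
        child("well_explored", visits=40, average_score=0.60, chemical_score=0.2),
        child("barely_tried", visits=2, average_score=0.40, chemical_score=0.9),
    ],
)

# Classical UCT only trades off mean score against visit counts.
print(Classical_UCT(parameters={"UCTK": 2}).calculate(parent).name)
# Chemical_UCT_1 also multiplies the exploration term by the move's chemical score.
print(Chemical_UCT_1(parameters={"UCTK": 3}).calculate(parent).name)
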
/biological_scoring.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines the biological scoring function.
3 | Requires random for random scoring, and all rule sets for biological scoring.
4 | """
5 |
6 | import random
7 | from rule_sets_examples import *
8 | from rule_sets_similarity import *
9 |
10 | class BiologicalScoring(object):
11 | """
12 | Defines Biological Scorer object.
13 | Returns the biological score associated to a reaction rule.
14 | """
15 | def __init__(self, scoring_function):
16 | self.scoring_function = scoring_function
17 | self.name = "Random"
18 |
19 | def __repr__(self):
20 | return(self.name)
21 |
22 | def calculate(self, rule):
23 | score = self.scoring_function(rule)
24 | return(score)
25 |
26 | def pseudo_random(rule):
27 | score = random.uniform(0, 10)
28 | return(score)
29 |
30 | class BiologicalScoringOrganism(BiologicalScoring):
31 | """
32 | Defines Biological Scorer object from an organism with predefined scores.
33 | Inverted converts a penalty to a score.
34 |     This will be analysed in more depth when the biological score evolves.
35 | """
36 | def __init__(self, rules_dictionnary, inverted = False, name = "None"):
37 | BiologicalScoring.__init__(self, scoring_function = None)
38 | self.scoring_function = self.assign_from_dict(rules_dictionnary, inverted)
39 | self.name = name
40 |
41 | def __repr__(self):
42 | return(self.name)
43 |
44 | def assign_from_dict(self, rules_dictionnary, inverted):
45 |         # rules_dictionnary is captured by the closure below
46 | def simple_assign_inside(rule):
47 | score = rules_dictionnary[rule]["biological_score"]
48 |             # Inverted is used to work with penalties instead of scores.
49 | # if inverted:
50 | # try:
51 | # return(1/score)
52 | # except ZeroDivisionError:
53 | # return(33)
54 | # else:
55 | return(score)
56 | return(simple_assign_inside)
57 |
58 |
59 | RandomBiologicalScorer = BiologicalScoring(scoring_function = pseudo_random)
60 | BiologicalFullScoringRetroH = BiologicalScoringOrganism(rules_dictionnary= full_rules_retro_H, name = "full_rules_retro_H")
61 | BiologicalFullScoringFwdH = BiologicalScoringOrganism(rules_dictionnary= full_rules_forward_H, name = "full_rules_forward_H")
62 | BiologicalFullScoringRetroNoH = BiologicalScoringOrganism(rules_dictionnary= full_rules_retro_no_H, name = "full_rules_retro_no_H")
63 | BiologicalFullScoringFwdNoH = BiologicalScoringOrganism(rules_dictionnary= full_rules_forward_no_H, name = "full_rules_forward_no_H")
64 |
65 | full_H = dict(full_rules_retro_H)  # copy so that full_rules_retro_H is not mutated by the update below
66 | full_H.update(full_rules_forward_H)
67 | BiologicalFullScoringH = BiologicalScoringOrganism(rules_dictionnary= full_H, name = "full_rules_H")
68 |
--------------------------------------------------------------------------------
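
A minimal sketch of the organism-based scorer above, with a toy rules dictionary: the rule IDs are taken from elsewhere in the code base, while the scores are invented for illustration. BiologicalScoringOrganism only requires the {rule: {"biological_score": value}} layout; note that importing biological_scoring also loads the full rule sets at module level, which requires the rule data to be available.

from biological_scoring import BiologicalScoringOrganism

# Hypothetical scores attached to two rule IDs that appear elsewhere in the code base.
toy_rules = {
    "MNXR94682_MNXM821": {"biological_score": 0.92},
    "MNXR117465_MNXM821": {"biological_score": 0.41},
}

scorer = BiologicalScoringOrganism(rules_dictionnary=toy_rules, name="toy_scorer")
print(scorer)                                 # toy_scorer
print(scorer.calculate("MNXR94682_MNXM821"))  # 0.92
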
/calculate_organisms.py:
--------------------------------------------------------------------------------
1 | """
2 | This module calculates organisms:
3 | - standardises compounds within the organism
4 | - saves them as pickles that can be loaded by RP3
5 | """
6 |
7 | # General utilities
8 | import logging
9 | import os
10 | import csv
11 | import sys
12 | import argparse
13 |
14 | from config import DATA_PATH
15 |
16 | # RP3 specific objects
17 | from compound import Compound
18 | from chemical_compounds_state import ChemicalCompoundState
19 | from utilities.reactor.Utils import ChemConversionError
20 |
21 |
22 | def __run__():
23 | def import_organism_from_csv(csv_file, add_Hs=True):
24 | with open(csv_file) as csv_handle:
25 | dict_reader = csv.DictReader(csv_handle, delimiter=",")
26 | compound_list = []
27 | for row in dict_reader:
28 | name = row["name"]
29 | inchi = row["inchi"]
30 | if inchi is None or inchi == "None" or inchi == "":
31 | pass
32 | else:
33 | try:
34 | if name.startswith("InChI"):
35 | compound = Compound(
36 | InChI=inchi,
37 | heavy_standardisation=True,
38 | force_add_H=add_Hs,
39 | )
40 | else:
41 | compound = Compound(
42 | InChI=inchi,
43 | name=name,
44 | heavy_standardisation=True,
45 | force_add_H=add_Hs,
46 | )
47 | if not compound.in_list(compound_list, main_layer=False):
48 | compound_list.append(compound)
49 | except ChemConversionError:
50 | logging.error(
51 | "For compound {} with inchi {}: error ChemConversionError".format(
52 | name, inchi
53 | )
54 | )
55 | organism = ChemicalCompoundState(compound_list, main_layer=False)
56 | return organism
57 |
58 | # Calculate with H ========================================================
59 | logging.info("Calculating organisms with H...")
60 |
61 | # Test organism
62 | compound_1 = Compound(
63 | "[H+]", name="1", heavy_standardisation=True, force_add_H=True
64 | )
65 | compound_6 = Compound(
66 | "[H][N]=[C]([O][H])[C]1=[C]([H])[N]([C]2([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][C]([H])([H])[C]3([H])[O][C]([H])([n]4[c]([H])[n][c]5[c]([N]([H])[H])[n][c]([H])[n][c]54)[C]([H])([O][P](=[O])([O][H])[O][H])[C]3([H])[O][H])[C]([H])([O][H])[C]2([H])[O][H])[C]([H])=[C]([H])[C]1([H])[H]",
67 | force_add_H=True,
68 | name="6",
69 | heavy_standardisation=True,
70 | )
71 | compound_3459 = Compound(
72 | "[H][O][C](=[O])[C](=[O])[C]([H])([H])[C]([H])([O][H])[C]([H])([O][H])[C]([H])([H])[H]",
73 | name="3459",
74 | heavy_standardisation=True,
75 | force_add_H=True,
76 | )
77 | test_organism = ChemicalCompoundState(
78 | state_name="Test", compound_list=[compound_1, compound_6, compound_3459]
79 | )
80 |
81 | # Load real organisms
82 | detectable_cmpds = import_organism_from_csv(
83 | f"{SINK_DATA_PATH}/detectable_metabolites_uncommented.csv", add_Hs=True
84 | )
85 | iML1515_chassis = import_organism_from_csv(
86 | f"{SINK_DATA_PATH}/ecoli_iML1515_sink_reduced_rp_ready.csv", add_Hs=True
87 | )
88 | core_ecoli = import_organism_from_csv(
89 | f"{SINK_DATA_PATH}/ecoli_core_sink_reduced_rp_ready.csv", add_Hs=True
90 | )
91 | iJO1366_chassis = import_organism_from_csv(
92 | f"{SINK_DATA_PATH}/ecoli_iJO1366_sink_reduced_rp_ready.csv", add_Hs=True
93 | )
94 | bsubtilis = import_organism_from_csv(
95 | f"{SINK_DATA_PATH}/bsubtilis_iYO844_sink_reduced_rp_ready.csv", add_Hs=True
96 | )
97 |
98 | # Save organisms
99 | test_organism.save(file_name="Test_organism_H", folder_address=ORGANISMS_DATA_PATH)
100 | detectable_cmpds.save(
101 | file_name="detectable_cmpds_H", folder_address=ORGANISMS_DATA_PATH
102 | )
103 | iML1515_chassis.save(
104 | file_name="iML1515_chassis_H", folder_address=ORGANISMS_DATA_PATH
105 | )
106 | core_ecoli.save(file_name="core_ecoli_H", folder_address=ORGANISMS_DATA_PATH)
107 | iJO1366_chassis.save(
108 | file_name="iJO1366_chassis_H", folder_address=ORGANISMS_DATA_PATH
109 | )
110 | bsubtilis.save(file_name="bsubtilis_H", folder_address=ORGANISMS_DATA_PATH)
111 |
112 | # Calculate without H =====================================================
113 | logging.info("Calculating organisms without H...")
114 |
115 | # Test organism
116 | compound_1 = Compound(
117 | "[H+]", name="1", heavy_standardisation=True, force_add_H=False
118 | )
119 | compound_6 = Compound(
120 | "[H][N]=[C]([O][H])[C]1=[C]([H])[N]([C]2([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][C]([H])([H])[C]3([H])[O][C]([H])([n]4[c]([H])[n][c]5[c]([N]([H])[H])[n][c]([H])[n][c]54)[C]([H])([O][P](=[O])([O][H])[O][H])[C]3([H])[O][H])[C]([H])([O][H])[C]2([H])[O][H])[C]([H])=[C]([H])[C]1([H])[H]",
121 | force_add_H=False,
122 | name="6",
123 | heavy_standardisation=True,
124 | )
125 | compound_3459 = Compound(
126 | "[H][O][C](=[O])[C](=[O])[C]([H])([H])[C]([H])([O][H])[C]([H])([O][H])[C]([H])([H])[H]",
127 | name="3459",
128 | heavy_standardisation=True,
129 | force_add_H=False,
130 | )
131 | test_organism = ChemicalCompoundState(
132 | state_name="Test", compound_list=[compound_1, compound_6, compound_3459]
133 | )
134 |
135 | # Load real organisms
136 | detectable_cmpds = import_organism_from_csv(
137 |         f"{SINK_DATA_PATH}/detectable_metabolites_uncommented.csv", add_Hs=False
138 | )
139 | iML1515_chassis = import_organism_from_csv(
140 | f"{SINK_DATA_PATH}/ecoli_iML1515_sink_reduced_rp_ready.csv", add_Hs=False
141 | )
142 | core_ecoli = import_organism_from_csv(
143 | f"{SINK_DATA_PATH}/ecoli_core_sink_reduced_rp_ready.csv", add_Hs=False
144 | )
145 | iJO1366_chassis = import_organism_from_csv(
146 | f"{SINK_DATA_PATH}/ecoli_iJO1366_sink_reduced_rp_ready.csv", add_Hs=False
147 | )
148 | bsubtilis = import_organism_from_csv(
149 | f"{SINK_DATA_PATH}/bsubtilis_iYO844_sink_reduced_rp_ready.csv", add_Hs=False
150 | )
151 |
152 | # Save organisms
153 | test_organism.save(
154 | file_name="Test_organism_noH", folder_address=ORGANISMS_DATA_PATH
155 | )
156 | detectable_cmpds.save(
157 | file_name="detectable_cmpds_noH", folder_address=ORGANISMS_DATA_PATH
158 | )
159 | iML1515_chassis.save(
160 | file_name="iML1515_chassis_noH", folder_address=ORGANISMS_DATA_PATH
161 | )
162 | core_ecoli.save(file_name="core_ecoli_noH", folder_address=ORGANISMS_DATA_PATH)
163 | iJO1366_chassis.save(
164 | file_name="iJO1366_chassis_noH", folder_address=ORGANISMS_DATA_PATH
165 | )
166 | bsubtilis.save(file_name="bsubtilis_noH", folder_address=ORGANISMS_DATA_PATH)
167 |
168 | return 0
169 |
170 |
171 | if __name__ == "__main__":
172 | d = "Formatting organisms in a RP3 compatible format"
173 | parser = argparse.ArgumentParser(description=d)
174 | parser.add_argument(
175 | "--terminal",
176 | help="Default logger is logs_organisms_set_up, switch to terminal if specified",
177 | action="store_true",
178 | default=False,
179 | )
180 | args = parser.parse_args()
181 |
182 | # Sink data path
183 | global SINK_DATA_PATH
184 | SINK_DATA_PATH = f"{DATA_PATH}/sinks"
185 | assert os.path.exists(
186 | SINK_DATA_PATH
187 | ), f"Sink data path {SINK_DATA_PATH} does not exist"
188 |
189 | # Organisms data path
190 | global ORGANISMS_DATA_PATH
191 | ORGANISMS_DATA_PATH = f"{DATA_PATH}/organisms"
192 | if not os.path.exists(ORGANISMS_DATA_PATH):
193 | os.mkdir(ORGANISMS_DATA_PATH)
194 |
195 | if args.terminal is True:
196 | logging.basicConfig(
197 | stream=sys.stderr,
198 | level=logging.INFO,
199 | datefmt="%d/%m/%Y %H:%M:%S",
200 | format="%(asctime)s -- %(levelname)s -- %(message)s",
201 | )
202 | else:
203 | logging.basicConfig(
204 | stream=open(
205 | "{}/{}.log".format(ORGANISMS_DATA_PATH, "logs_organisms_set_up"), "w"
206 | ),
207 | level=logging.INFO,
208 | datefmt="%d/%m/%Y %H:%M:%S",
209 | format="%(asctime)s -- %(levelname)s -- %(message)s",
210 | )
211 | print(
212 | f"By default, logs are saved in {ORGANISMS_DATA_PATH}/logs_organisms_set_up.log. Please use --terminal to redirect to sys.stderr"
213 | )
214 | __run__()
215 |
--------------------------------------------------------------------------------
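
The sink CSVs read by import_organism_from_csv are plain comma-separated files whose header must provide the "name" and "inchi" columns used by the DictReader; rows with an empty or "None" InChI are skipped. A minimal sketch of writing such a file (file name and compounds chosen for illustration):

import csv

# Hypothetical sink entries; the first "inchi" value is the standard InChI for pyruvic acid.
rows = [
    {"name": "pyruvate", "inchi": "InChI=1S/C3H4O3/c1-2(4)3(5)6/h1H3,(H,5,6)"},
    {"name": "skipped_compound", "inchi": "None"},  # empty or "None" InChI rows are ignored
]

with open("data/sinks/my_custom_sink.csv", "w", newline="") as handle:
    writer = csv.DictWriter(handle, fieldnames=["name", "inchi"])
    writer.writeheader()
    writer.writerows(rows)

# calculate_organisms.py only loads the sink files hard-coded in __run__, so a new sink
# also needs a matching import_organism_from_csv(...) call and .save(...) there, then:
#     python calculate_organisms.py --terminal
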
/change_config.py:
--------------------------------------------------------------------------------
1 | """
2 | The aim of this script is to change the configuration file from the command line.
3 | It takes as input the base config file from the data folder.
4 | """
5 |
6 | import argparse
7 | import re
8 | import os
9 |
10 |
11 | def __cli():
12 | """
13 | Command line interface.
14 | """
15 |
16 | d = "Arguments to change the config file before running a Tree"
17 | parser = argparse.ArgumentParser(description=d)
18 | # Logs and saving information
19 | parser.add_argument(
20 | "--DB_CACHE", type=lambda x: (str(x).lower() == "true"), default=False
21 | )
22 | parser.add_argument(
23 | "--DB_REPLACE", type=lambda x: (str(x).lower() == "true"), default=False
24 | )
25 | parser.add_argument("--DB_time", default=1, type=float)
26 | parser.add_argument(
27 | "--biosensor", type=lambda x: (str(x).lower() == "true"), default=False
28 | )
29 | parser.add_argument(
30 | "--use_cache", type=lambda x: (str(x).lower() == "true"), default=False
31 | )
32 | parser.add_argument(
33 | "--add_Hs", type=lambda x: (str(x).lower() == "true"), default=False
34 | )
35 | parser.add_argument(
36 | "--use_transpositions", type=lambda x: (str(x).lower() == "true"), default=False
37 | )
38 | parser.add_argument(
39 | "--use_transpositions_depth",
40 | type=lambda x: (str(x).lower() == "true"),
41 | default=False,
42 | )
43 | parser.add_argument(
44 | "--folder_to_save", default=os.path.dirname(os.path.abspath(__file__))
45 | )
46 | args = parser.parse_args()
47 |
48 | def change_dB_setting(
49 | DB_CACHE,
50 | DB_REPLACE,
51 | DB_time,
52 | biosensor,
53 | use_cache,
54 | add_Hs,
55 | use_transpositions,
56 | use_transpositions_depth,
57 | folder_to_save,
58 | ):
59 | with open(
60 | "{}/data/base_config.py".format(os.path.dirname(os.path.abspath(__file__))),
61 | "r",
62 | ) as file_original:
63 | whole_text = file_original.read()
64 | with open("{}/config.py".format(folder_to_save), "w") as replacement_text:
65 | # Changing DB_cache
66 | if DB_CACHE:
67 | if "DB_CACHE = True" not in whole_text:
68 | whole_text = whole_text.replace(
69 | "DB_CACHE = False", "DB_CACHE = True"
70 | )
71 | else:
72 | if "DB_CACHE = False" not in whole_text:
73 | whole_text = whole_text.replace(
74 | "DB_CACHE = True", "DB_CACHE = False"
75 | )
76 | # Changing DB replace
77 | if DB_REPLACE:
78 | if "DB_REPLACE = True" not in whole_text:
79 | whole_text = whole_text.replace(
80 | "DB_REPLACE = False", "DB_REPLACE = True"
81 | )
82 | else:
83 | if "DB_REPLACE = False" not in whole_text:
84 | whole_text = whole_text.replace(
85 | "DB_REPLACE = True", "DB_REPLACE = False"
86 | )
87 | # Changing DB_time:
88 | whole_text = re.sub(
89 |                 r"DB_time = \d+\.\d+", "DB_time = {}".format(DB_time), whole_text
90 | )
91 |
92 | # Changing running mode from biosensor to retrosynthesis
93 | if biosensor:
94 | if "biosensor = True" not in whole_text:
95 | whole_text = whole_text.replace(
96 | "biosensor = False", "biosensor = True"
97 | )
98 | whole_text = whole_text.replace(
99 | "retrosynthesis = True", "retrosynthesis = False"
100 | )
101 | else:
102 | if "biosensor = False" not in whole_text:
103 | whole_text = whole_text.replace(
104 | "biosensor = True", "biosensor = False"
105 | )
106 | whole_text = whole_text.replace(
107 | "retrosynthesis = False", "retrosynthesis = True"
108 | )
109 | # Changing use_cache
110 | if use_cache:
111 | if "use_cache = True" not in whole_text:
112 | whole_text = whole_text.replace(
113 | "use_cache = False", "use_cache = True"
114 | )
115 | else:
116 | if "use_cache = False" not in whole_text:
117 | whole_text = whole_text.replace(
118 | "use_cache = True", "use_cache = False"
119 | )
120 |
121 | # Hydrogen handling:
122 | if add_Hs:
123 | if "add_Hs = True" not in whole_text:
124 | whole_text = whole_text.replace("add_Hs = False", "add_Hs = True")
125 | else:
126 | if "add_Hs = False" not in whole_text:
127 | whole_text = whole_text.replace("add_Hs = True", "add_Hs = False")
128 |
129 | # Changing use_transpositions
130 | if use_transpositions:
131 | if "use_transpositions = True" not in whole_text:
132 | whole_text = whole_text.replace(
133 | "use_transpositions = False", "use_transpositions = True"
134 | )
135 | else:
136 | if "use_transpositions = False" not in whole_text:
137 | whole_text = whole_text.replace(
138 | "use_transpositions = True", "use_transpositions = False"
139 | )
140 | # Changing use_transpositions_depth
141 | if use_transpositions_depth:
142 | if "use_transpositions_depth = True" not in whole_text:
143 | whole_text = whole_text.replace(
144 | "use_transpositions_depth = False",
145 | "use_transpositions_depth = True",
146 | )
147 | else:
148 | if "use_transpositions_depth = False" not in whole_text:
149 | whole_text = whole_text.replace(
150 | "use_transpositions_depth = True",
151 | "use_transpositions_depth = False",
152 | )
153 | replacement_text.write(whole_text)
154 |
155 | change_dB_setting(
156 | DB_CACHE=args.DB_CACHE,
157 | DB_REPLACE=args.DB_REPLACE,
158 | DB_time=args.DB_time,
159 | biosensor=args.biosensor,
160 | use_cache=args.use_cache,
161 | add_Hs=args.add_Hs,
162 | use_transpositions=args.use_transpositions,
163 | use_transpositions_depth=args.use_transpositions_depth,
164 | folder_to_save=args.folder_to_save,
165 | )
166 |
167 |
168 | if __name__ == "__main__":
169 | __cli()
170 |
--------------------------------------------------------------------------------
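
A hedged invocation sketch for the script above (flag values chosen for illustration): it rewrites data/base_config.py into a config.py written to the requested folder.

import os
import subprocess
import sys

here = os.path.dirname(os.path.abspath(__file__))
subprocess.run(
    [
        sys.executable, os.path.join(here, "change_config.py"),
        "--biosensor", "true",     # switches retrosynthesis off and biosensor on
        "--add_Hs", "true",
        "--DB_time", "5",
        "--folder_to_save", here,  # where the generated config.py is written
    ],
    check=True,
)
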
/chemical_scoring.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines the chemical scoring functions
3 | """
4 |
5 | # General utility packages
6 | import random
7 | import itertools # For all permutations when IDing the best products
8 | import numpy as np # Allows for simpler calculations on lists
9 | import logging
10 |
11 | # Chemistry packages
12 | from rdkit import DataStructs # For similarity computation
13 |
14 | def list_product(combination):
15 | """
16 | Calculates the product of all elements from the list.
17 | Remark: deprecated, use geometric mean instead.
18 | """
19 | score = 1
20 | for tanimoto in combination:
21 | score = score * tanimoto
22 | return(score)
23 |
24 | def combine_products(product_list, product_list_bis, max_combination = 1000):
25 | """
26 | Calculates all possible combinations of products (native and query products).
27 |     Limited to max_combination (1000 by default), since the number of combinations grows as n! where n is the number of products.
28 | """
29 | combinations = [(x,product_list_bis) for x in itertools.permutations(product_list,len(product_list_bis))]
30 | if len(combinations) > max_combination:
31 | combinations = combinations[0:max_combination]
32 | return(combinations)
33 |
34 | def list_geometric_mean(combination):
35 | """
36 | Calculates the geometric mean of the array.
37 | """
38 | a = np.array(combination)
39 | return a.prod()**(1.0/len(a))
40 |
41 | def tanimoto_product_calc(native_products_ecfp, query_products_ecfp, verbose = False):
42 | all_scores = []
43 | if len(native_products_ecfp) != len(query_products_ecfp):
44 | # Reject rules that do not produce the same number of compounds.
45 | logging.debug("Rule does not generate the same number of products: native is {} and new is {}".format(len(native_products_ecfp), len(query_products_ecfp)))
46 | return(-1)
47 | combinations = combine_products(product_list = native_products_ecfp, product_list_bis = query_products_ecfp)
48 | score_list = []
49 | for combination in combinations:
50 | tanimoto_combination = []
51 | native, query = combination[0], combination[1]
52 | for i in range(len(native)):
53 | tanimoto = DataStructs.cDataStructs.TanimotoSimilarity(native[i], query[i])
54 | tanimoto_combination.append(tanimoto)
55 | score_list.append(list_geometric_mean(tanimoto_combination))
56 | if verbose:
57 | logging.debug("Score list length is {} and scores {}".format(len(score_list), score_list))
58 | return(max(score_list))
59 |
60 | class ChemicalScoring(object):
61 |     """
62 |     Defines Chemical Scorer objects.
63 |     """
64 |     logger = logging.getLogger(__name__)
65 | def __init__(self, scoring_function, name = "ChemicalScoring"):
66 | self.scoring_function = scoring_function
67 | self.scoring_warning = True
68 | self.name = name
69 |
70 | def calculate(self, compound, products = None, rule = None, original_substrates_list = None, original_products_list_list = None):
71 | if original_substrates_list == [None] and (original_products_list_list is None or original_products_list_list == [None]):
72 | if self.scoring_warning:
73 | self.scoring_warning = False
74 | self.logger.warning("Not using chemical scoring for {}. Default is set to 1".format(self.name))
75 | return(1)
76 | score, warning = self.scoring_function(compound, products, rule, original_substrates_list, original_products_list_list)
77 |         if warning is not None:
78 | self.logger.debug(warning)
79 | return(score)
80 |
81 | def pseudo_random(compound, products, rule, original_substrates_list = None, original_products_list_list = None):
82 | """
83 | Was used during development.
84 | """
85 | warning = None
86 | if compound.InChIKey == "NBBJYMSMWIIQGU-UHFFFAOYSA-N":
87 | if rule == "MNXR94682_MNXM821":
88 | score = 0.99
89 | elif rule == "MNXR117465_MNXM821":
90 | score = 0.88
91 | else:
92 | score = random.uniform(0,0.75)
93 | elif compound.InChIKey == "DNIAPMSPPWPWGF-UHFFFAOYSA-N":
94 | if rule == "MNXR95713_MNXM90191":
95 | score = 0.80
96 | elif rule == "MNXR103108_MNXM90191":
97 | score = 0.76
98 | else:
99 | score = random.uniform(0,0.75)
100 | else:
101 | score = random.uniform(0,0.75)
102 | return(score, warning)
103 |
104 | def substrate_calculation(compound, products = None, rule = None, original_substrates_list = None, original_products_list_list = None):
105 | """
106 |     If the original_substrates_list is None, it means chemical scoring is not implemented and scoring should be neutral: 1 in multiplication.
107 | """
108 | if original_substrates_list is None:
109 | warning = "Score is set to 1 for cmp {} and rule {}".format(compound, rule)
110 | tanimoto = 1
111 | else:
112 | tanimoto = 0
113 | for native_substrate in original_substrates_list:
114 | query_substrate = compound._get_ECFP()
115 | tanimoto_this = DataStructs.cDataStructs.TanimotoSimilarity(query_substrate, native_substrate)
116 | warning = None
117 | tanimoto = max(tanimoto, tanimoto_this)
118 | return(tanimoto, warning)
119 |
120 | def substrate_and_product_calculation(compound, products, rule, original_substrates_list = None, original_products_list_list = None):
121 | """
122 |     If the original_substrates_list is None, it means chemical scoring is not implemented and scoring should be neutral: 1 in multiplication.
123 | """
124 | warning = None
125 | if original_substrates_list is None:
126 | warning = "Score is set to 1 for cmp {} and rule {}".format(compound, rule)
127 | tanimoto = 1
128 | return(tanimoto, warning)
129 | else:
130 | tanimoto = 0
131 | for i in range(len(original_substrates_list)):
132 | native_substrate = original_substrates_list[i]
133 | query_substrate = compound._get_ECFP()
134 | tanimoto_substrate = DataStructs.cDataStructs.TanimotoSimilarity(query_substrate, native_substrate)
135 | warning = None
136 | query_products_ecfp = []
137 | for prod in products:
138 | query_products_ecfp.append(prod._get_ECFP())
139 | prod_result = tanimoto_product_calc(original_products_list_list[i], query_products_ecfp, verbose = False)
140 | if prod_result == -1:
141 | warning = "Number of product issue with rule {} and products {}".format(rule, products)
142 | tanimoto = max(tanimoto, tanimoto_substrate * prod_result)
143 | return(tanimoto, warning)
144 |
145 | def constant_scorer(compound, products, rule, original_substrates_list = None, original_products_list_list = None):
146 | warning = None
147 | return(1, warning)
148 |
149 | RandomChemicalScorer = ChemicalScoring(scoring_function = pseudo_random, name = "RandomChemicalScorer")
150 | SubstrateChemicalScorer = ChemicalScoring(scoring_function = substrate_calculation, name = "SubstrateChemicalScorer")
151 | SubandprodChemicalScorer = ChemicalScoring(scoring_function = substrate_and_product_calculation, name = "SubandprodChemicalScorer")
152 | ConstantChemicalScorer = ChemicalScoring(scoring_function = constant_scorer, name = "ConstantChemicalScorer")
153 | # Chemical scoring utilities. Taken from similarity.
154 |
--------------------------------------------------------------------------------
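
A minimal sketch of the product-side scoring helper above (SMILES chosen for illustration), using radius-2, 1024-bit Morgan fingerprints as elsewhere in the project.

from rdkit import Chem
from rdkit.Chem import AllChem

from chemical_scoring import tanimoto_product_calc


def ecfp(smiles):
    # Radius-2, 1024-bit Morgan fingerprint, as used elsewhere in the project.
    return AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024)


native_products = [ecfp("CC(=O)O"), ecfp("O")]  # acetate + water
query_products = [ecfp("CCC(=O)O"), ecfp("O")]  # propionate + water

# Best geometric-mean Tanimoto over all pairings of native vs. query products.
print(tanimoto_product_calc(native_products, query_products))
# Different number of products on each side: the rule is rejected with a score of -1.
print(tanimoto_product_calc(native_products, [ecfp("O")]))
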
/chemistry_choices.md:
--------------------------------------------------------------------------------
1 | The aim of this file is to document the chemoinformatics choices made while correcting bugs, and the reasoning behind them.
2 |
3 | # Chemical rule application.
4 |
5 | When a rule applies to a substrate and after standardisation produces this substrate again (S -> S + I), the rule is deleted as this is not biological.
6 | This is corrected at the compound stage.
7 |
8 | # Compound equality: either main layer or full inchikey
9 | Choices: usually less stringent for the chassis.
10 |
11 | # Moves generating duplicate compounds:
12 | - Only unique compounds are conserved.
13 | - Logs will say it is merged (and conserve the number of compounds in the stoichiometry dictionary)
14 |
15 |
16 | # Moves generating the same compounds:
17 |
18 | Keep the one with the higher score. In practice, also keeping the synonyms (transformation IDs) of the other moves generating the same compounds.
19 |
20 | # History of the state.
21 |
22 | Keeping a history of visited compounds (excluding the organism's compounds).
23 | - Refuse moves that generate compounds present in the history to avoid loops.
24 |
25 | # Refusing rules that produce a different number of compounds than the original.
26 | 
27 | This can happen when a rule learned on 2 molecules matches subgroups of a much bigger molecule during the retrosynthetic search.
28 | It is unrealistic to expect an enzyme to work this way.
29 |
--------------------------------------------------------------------------------
/compound_scoring.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines the compound scoring function.
3 | Currently implements toxicity in E. coli, based on data from EcoliTox.
4 | """
5 |
6 | # General use packages
7 | import random
8 | import numpy as np
9 | import sys
10 | import csv
11 | import math
12 | import logging
13 | from rdkit.Chem import AllChem, DataStructs  # AllChem is needed for the Morgan fingerprints below
14 | from rdkit import Chem
15 |
16 | from config import *
17 |
18 |
19 | class CompoundScoring(object):
20 | """
21 | Defines Compound Scorer object.
22 | """
23 | logger = logging.getLogger(__name__)
24 | def __init__(self, scoring_function = None):
25 | if scoring_function is None:
26 | pass
27 | else:
28 | self.scoring_function = scoring_function
29 |
30 | def __repr__(self):
31 | """
32 | Name the used scorer.
33 |         Raises an error if the class is not properly instantiated.
34 | """
35 | return(self.name)
36 |
37 | def calculate(self, compound):
38 | score = self.scoring_function(compound)
39 | return(score)
40 |
41 | def pseudo_random(compound):
42 | score = random.uniform(0, 10)
43 | return(score)
44 |
45 |
46 | class ToxicityScoring(CompoundScoring):
47 | """
48 | Returns the log toxicity value of a compound.
49 | The data is stored in a csv file, tab delimited, with columns "name", 'InChI' and "toxicity"
50 | This can easily be changed to another data with a similar formatting.
51 | """
52 | def __init__(self, toxicity_data = "{}/name_structure_toxicity.csv".format(DATA_PATH)):
53 | CompoundScoring.__init__(self)
54 | self.scoring_function = self.scoring_function()
55 | self.name = "ToxicityScoring"
56 | self.fit_model(toxicity_data)
57 |
58 | def calculate_ECFP(self,inchi):
59 | rdmol = Chem.inchi.MolFromInchi(inchi, sanitize=False)
60 | # rd_mol = standardize_chemical(rdmol, add_hs=False, heavy = True, rm_stereo=True)
61 | ECFP= Chem.AllChem.GetMorganFingerprintAsBitVect(rdmol, radius = 2, nBits=1024, useFeatures = False, useChirality = False)
62 | return(ECFP)
63 |
64 | def select_current_best_model(self, X, y,
65 | models_number = 10,
66 | verbose = False):
67 |
68 | trained_model_list = []
69 | # Training all models
70 | for i in range(models_number):
71 | X_train, y_train = X, y
72 | other_MLP = MLPRegressor(hidden_layer_sizes = (10, 100,100, 20), solver ="adam", max_iter=20000,
73 | early_stopping = True, learning_rate = "adaptive")
74 | other_MLP.fit(X_train, y_train.flatten())
75 | trained_model_list.append(other_MLP)
76 |
77 | big_MLP = MLPRegressor(hidden_layer_sizes = (100,100, 20),solver ="adam", max_iter=20000,
78 | early_stopping = True, learning_rate = "adaptive")
79 | big_MLP.fit(X_train, y_train.flatten())
80 | trained_model_list.append(big_MLP)
81 |
82 |
83 | medium_MLP = MLPRegressor(hidden_layer_sizes = (40, 10), solver ="adam", max_iter=20000,
84 | early_stopping = True, learning_rate = "adaptive")
85 | medium_MLP.fit(X_train, y_train.flatten())
86 | trained_model_list.append(medium_MLP)
87 |
88 |         small_MLP = MLPRegressor(hidden_layer_sizes = (10,), solver ="adam", max_iter=20000,
89 | early_stopping = True, learning_rate = "adaptive")
90 | small_MLP.fit(X_train, y_train.flatten())
91 | trained_model_list.append(small_MLP)
92 |
93 | # Evaluating all
94 | all_scores = []
95 | for i in range(len(trained_model_list)):
96 |             selected_model = trained_model_list[i]
97 |             y_pred = selected_model.predict(X)
98 | score = sklearn.metrics.r2_score(y, y_pred)
99 | all_scores.append(score)
100 |
101 | try:
102 | best_index = all_scores.index(max(all_scores))
103 | best_score = all_scores[best_index]
104 | except ValueError:
105 | best_index = 0
106 | best_model = trained_model_list[best_index]
107 | return(best_model, best_score)
108 |
109 | def fit_model(self,toxicity_data):
110 | y = []
111 | X = None
112 | # Loading data
113 | with open(toxicity_data, "r") as file_hdl:
114 | reader = csv.DictReader(file_hdl, delimiter = '\t')
115 | for row in reader:
116 | y.append(math.log(float(row["toxicity"])))
117 | arr = np.zeros((1,))
118 | fp = self.calculate_ECFP(row["InChI"])
119 | DataStructs.ConvertToNumpyArray(fp, arr)
120 | arr = np.reshape(arr, (1, 1024))
121 | if X is None:
122 | X = arr
123 | else:
124 | X = np.concatenate((X, arr), axis = 0)
125 | self.log_loading = "Loaded {} compounds from {}".format(len(y), toxicity_data)
126 | y = np.array(y)
127 |         # Fitting model:
128 | best_model, score = self.select_current_best_model(X, y, models_number = 10)
129 | y_pred = best_model.predict(X)
130 | score = sklearn.metrics.r2_score(y, y_pred)
131 | self.log_score = "The toxicity model has a R2 score of {} on itself".format(round(score, 2))
132 | self.model = best_model
133 |
134 | def scoring_function(self):
135 |         # Returns a closure that predicts log-toxicity from a compound's ECFP
136 | def compound_scoring(compound):
137 | ECFP = compound._get_ECFP()
138 | arr = np.zeros((1,))
139 | DataStructs.ConvertToNumpyArray(ECFP, arr)
140 | arr = np.reshape(arr, (1, 1024))
141 | y_pred = self.model.predict(arr)
142 | return(y_pred)
143 | return(compound_scoring)
144 |
145 |
146 | RandomCompoundScorer = CompoundScoring(scoring_function = pseudo_random)
147 | if use_toxicity:
148 | toxicity_scorer = ToxicityScoring()
149 |
--------------------------------------------------------------------------------
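A minimal usage sketch for the toxicity scorer above, assuming scikit-learn and RDKit are installed and the snippet is run from the repository root so that data/name_structure_toxicity.csv is found; the ethanol InChI is only an illustrative input:

# Sketch only: exercise the ToxicityScoring class defined above.
import numpy as np
from rdkit import DataStructs

from compound_scoring import ToxicityScoring

scorer = ToxicityScoring()   # fits the MLP on data/name_structure_toxicity.csv at construction
print(scorer.log_loading)    # e.g. "Loaded N compounds from ..."
print(scorer.log_score)      # self-fit R2 of the selected model

# Score an arbitrary structure by reusing the class' own fingerprint helper.
fp = scorer.calculate_ECFP("InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3")  # ethanol, illustrative only
arr = np.zeros((1,))
DataStructs.ConvertToNumpyArray(fp, arr)
print(scorer.model.predict(arr.reshape(1, 1024)))  # predicted log(IC50)

Note that the MLP is re-fitted every time ToxicityScoring() is instantiated, which is also what happens at import time when use_toxicity is enabled in config.py.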
/config.py:
--------------------------------------------------------------------------------
1 | """
2 | the aim of this file is to store configuration parameters, notably for the DB.
3 | It replaces what I previously wanted to define as 'global'
4 | """
5 | try:
6 | from rp3_dcache.Manager import Manager # In house module
7 | from rp3_dcache.Utils import make_document_id, as_document, rdmols_from_document
8 | dcache_installed = True
9 | except ModuleNotFoundError:
10 | dcache_installed = False
11 | import logging
12 | import os
13 |
14 | # Files and addresses configurations - should not be modified:
15 | global DATA_PATH
16 | DATA_PATH = "{}/data".format(os.path.dirname(os.path.abspath(__file__)))
17 |
18 | global add_Hs
19 | add_Hs = True
20 | hydrogen_config = "Using explicit hydrogens : {}".format(add_Hs)
21 |
22 | # Database for storing results configuration
23 | global DB_CACHE
24 | global DB_REPLACE
25 | DB_CACHE = False and dcache_installed
26 | DB_REPLACE = False and dcache_installed
27 | DB_time = 0
28 | if DB_CACHE:
29 | global CACHE_MGR
30 | if add_Hs:
31 | CACHE_MGR = Manager(replace=DB_REPLACE, collection = "results_with_H")
32 | else:
33 | CACHE_MGR = Manager(replace=DB_REPLACE, collection = "results_without_H")
34 | CACHE_MGR.connect()
35 | DB_config = "Setting the DB from config file: Installed package: {}. Using cache DB: {}; Replacing results: {}".format(dcache_installed, DB_CACHE, DB_REPLACE)
36 | elif dcache_installed:
37 | DB_config = "Setting the DB from config file: Installed package: {}. Using cache DB: {}; Replacing results: {}".format(dcache_installed, DB_CACHE, DB_REPLACE)
38 | else:
39 | DB_config = "Setting the DB from config file: Installed package: {}".format(dcache_installed)
40 |
41 | # Mode for using RP3: retrosynthesis or biosensor. QSAR might be implemented one day.
42 | global retrosynthesis
43 | global biosensor
44 | retrosynthesis = True
45 | biosensor = False
46 | tree_mode_config = "Using retrosynthesis: {} - using biosensor {}".format(retrosynthesis, biosensor)
47 |
48 | # Configuring local cache. Could be replaced by a proper caching system one day.
49 | global home_made_cache
50 | home_made_cache = {}
51 |
52 | global use_cache
53 | use_cache = False
54 |
55 | cache_config = "Initialising an empty cache: {}; Using it: {}".format(home_made_cache, use_cache)
56 |
57 | # MCTS parameters for configuration
58 |
59 | global transposition_table
60 | global use_transpositions
61 | global use_transpositions_depth
62 |
63 | transposition_table = {}
64 | use_transpositions = False
65 | use_transpositions_depth = False
66 |
67 | transposition_table_config = "Using transposition tables: {}. With depth: {}".format(use_transpositions, use_transpositions_depth)
68 |
69 | # For toxicity, using log(IC50) as penalty when below 0.
70 | global use_toxicity
71 | try:
72 | import sklearn
73 | from sklearn.neural_network import MLPRegressor
74 | sklearn_here = True
75 | except ModuleNotFoundError:
76 | toxicity_config = "Toxicity will not be enabled because sklearn is not installed"
77 | sklearn_here = False
78 | use_toxicity = False
79 | use_toxicity = use_toxicity and sklearn_here
80 |
--------------------------------------------------------------------------------
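The *_config strings defined above are summary messages describing the active configuration; a minimal sketch of echoing them at start-up (the basicConfig call here is only an assumption for the example):

# Sketch only: echo the configuration summary strings defined in config.py.
import logging

import config

logging.basicConfig(level=logging.INFO)  # logging set-up is for this example only
for line in (config.hydrogen_config, config.DB_config, config.tree_mode_config,
             config.cache_config, config.transposition_table_config):
    logging.info(line)
# toxicity_config is only defined when sklearn is missing, hence the getattr guard.
logging.info(getattr(config, "toxicity_config", "Toxicity enabled: {}".format(config.use_toxicity)))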
/convert_to_SBML.py:
--------------------------------------------------------------------------------
1 | """
2 | Converts pathways from JSON format to SBML format.
3 | """
4 |
5 | # General utilities
6 | import sys
7 | import logging
8 | import csv
9 | import copy
10 | import json
11 | import pickle
12 | import libsbml
13 | from hashlib import md5
14 | import os
15 | import argparse
16 |
17 | # RP3 specific objects
18 | from compound import Compound
19 | from move import Move
20 |
21 | def _nameToSbmlId(name):
22 | IdStream = []
23 | count = 0
24 | end = len(name)
25 | if '0' <= name[count] and name[count] <= '9':
26 | IdStream.append('_')
27 | for count in range(0, end):
28 | if (('0' <= name[count] and name[count] <= '9') or
29 | ('a' <= name[count] and name[count] <= 'z') or
30 | ('A' <= name[count] and name[count] <= 'Z')):
31 | IdStream.append(name[count])
32 | else:
33 | IdStream.append('_')
34 | Id = ''.join(IdStream)
35 | if Id[len(Id) - 1] != '_':
36 | return Id
37 | return Id[:-1]
38 |
39 | def add_specy(sbml_model,
40 | chemId = 'Id_cmpound',
41 | smiles = "smilescomppoun",
42 | inchi = "inchicompounds",
43 | inchiKey = "inchiKeycomppoun",
44 | name = "compounds_name",
45 | in_sink = False):
46 |
47 | spe = sbml_model.createSpecies()
48 | spe.setCompartment("cytoplasm")
49 | spe.setHasOnlySubstanceUnits(False)
50 | spe.setBoundaryCondition(False)
51 | spe.setConstant(False)
52 | spe.setInitialConcentration(1.0)
53 | clean_id = str(chemId)+'__64__'+str("cytoplasm")
54 | clean_id = clean_id.replace('-', '_') # No - in name
55 | metaid = _nameToSbmlId(md5(str(name).encode('utf-8')).hexdigest())
56 | spe.setMetaId(metaid)
57 | spe.setName(name)
58 | if in_sink:
59 | annotation = '''
60 | '''
62 | annotation += '''
63 |
64 |
65 | '''+str(smiles or '')+'''
66 | '''+str(inchi or '')+'''
67 | '''+str(inchiKey or '')+'''
68 | '''+ str(True)+'''
69 |
70 | '''
71 | annotation += '''
72 |
73 | '''
74 | else:
75 | annotation = '''
76 | '''
78 | annotation += '''
79 |
80 |
81 | '''+str(smiles or '')+'''
82 | '''+str(inchi or '')+'''
83 | '''+str(inchiKey or '')+'''
84 |
85 | '''
86 | annotation += '''
87 |
88 | '''
89 | spe.setAnnotation(annotation)
90 | return(sbml_model)
91 |
92 | def add_reaction(sbml_model,
93 | reacId = 'Id_reac',
94 | ec = "Test_ec",
95 | rule_id = "rule_id",
96 | biological_score = "biological_score",
97 | chemical_score = "chemical_score",
98 | reactant_stoechio = {},
99 | product = "product_name",
100 | reaction_smiles = "reaction_smiles",
101 | diameter = "diameter"):
102 | reac = sbml_model.createReaction()
103 |
104 | reac_fbc = reac.getPlugin('fbc')
105 | reac_fbc.setUpperFluxBound('B_999999')
106 | reac_fbc.setLowerFluxBound('B_0')
107 | #reactions
108 | reac.setId(reacId)
109 | reac.setSBOTerm(185)
110 | reac.setReversible(True)
111 | reac.setFast(False)
112 | metaid = _nameToSbmlId(md5(str(reacId).encode('utf-8')).hexdigest())
113 | reac.setMetaId(metaid)
114 | #reactants_dict
115 | for reactant in reactant_stoechio.keys():
116 | chemId = reactant
117 | spe = reac.createReactant()
118 | clean_id = str(chemId)+'__64__'+str("cytoplasm")
119 | clean_id = clean_id.replace('-', '_') # No - in name
120 | spe.setSpecies(clean_id)
121 | spe.setConstant(True)
122 | try:
123 | stoechio = reactant_stoechio[reactant]
124 | except KeyError:
125 | stoechio = 1
126 | spe.setStoichiometry(stoechio)
127 | #products_dict
128 | if not product is None:
129 | pro = reac.createProduct()
130 | clean_id = str(product)+'__64__'+str("cytoplasm")
131 | clean_id = clean_id.replace('-', '_') # No - in name
132 | pro.setSpecies(clean_id)
133 | pro.setConstant(True)
134 | pro.setStoichiometry(1)
135 | #annotation
136 | annotation = '''
137 | '''
139 |
140 | annotation += '''
141 |
142 |
143 | '''+str(reaction_smiles or '')+'''
144 | '''+str(rule_id or '')+'''
145 | '''+str(ec)+'''
146 |
147 |
148 |
149 |
150 |
151 |
152 | '''
153 | reac.setAnnotation(annotation)
154 | return(sbml_model)
155 |
156 |
157 | def convert_json_to_SBML(json_file, modelID = "test", folder_to_save = 'temp'):
158 | # Set up the empty model
159 | smbl_namespace = libsbml.SBMLNamespaces(3,1)
160 | smbl_namespace.addPkgNamespace('fbc',2)
161 | smbl_namespace.addPkgNamespace('groups',2)
162 | document = libsbml.SBMLDocument(smbl_namespace)
163 | sbml_model = document.createModel()
164 | sbml_model.getPlugin('fbc')
165 | sbml_model.getPlugin('groups')
166 | sbml_model.setId(modelID)
167 | sbml_model.setName(modelID)
168 | sbml_model.setTimeUnits('second')
169 | sbml_model.setExtentUnits('mole')
170 | sbml_model.setSubstanceUnits('mole')
171 | # Could implement units, currently removed from the model
172 |     # Should have it in a separate function
173 | compartment = sbml_model.createCompartment()
174 | compartment.setId("cytoplasm")
175 | target_node = None
176 | for node in json_file["elements"]["nodes"]:
177 | if node["data"]["type"] == "compound":
178 | sbml_model = add_specy(sbml_model,
179 | chemId = node["data"]["id"],
180 | smiles = node["data"]["SMILES"],
181 | inchi = node["data"]["InChI"],
182 | inchiKey = node["data"]["id"],
183 | name = ",".join(node["data"]["Names"]),
184 | in_sink = node["data"]["inSink"] == 1)
185 | if node["data"]["isSource"] == 1:
186 | logging.info("Target node is {}".format(node["data"]["id"]))
187 | target_node = node
188 | for element in sbml_model.getListOfSpecies():
189 | logging.debug(element)
190 | for node in json_file["elements"]["nodes"]:
191 | if node["data"]["type"] == "reaction":
192 | try:
193 | reactant_stoechio = node["data"]["Stoechiometry"]
194 | except KeyError:
195 | reactant_stoechio = {}
196 | sbml_model = add_reaction(sbml_model,
197 | reacId = node["data"]["id"],
198 | ec = ','.join(node["data"]["EC number"]),
199 | rule_id = ','.join(node["data"]["Rule ID"]),
200 | biological_score = node["data"]["Score"],
201 | chemical_score = node["data"]["ChemicalScore"],
202 | reactant_stoechio = reactant_stoechio,
203 | product = node["data"]["id"].split("-RR")[0],
204 | reaction_smiles = node["data"]["Reaction SMILES"],
205 | diameter = node["data"]["Diameter"])
206 | sbml_model = add_reaction(sbml_model,
207 | reacId = "production",
208 | ec = 'NA',
209 | rule_id = 'NA',
210 | biological_score = 'NA',
211 | chemical_score = 'NA',
212 | reactant_stoechio = {target_node["data"]["id"]: 1},
213 | product = None,
214 | reaction_smiles = 'NA',
215 | diameter = 'NA')
216 |
217 | document.setModel(sbml_model)
218 | libsbml.writeSBMLToFile(document,'{}/{}.xml'.format(folder_to_save, modelID))
219 | pass
220 |
221 |
222 | def __cli():
223 | def define_folder_to_save(folder):
224 | if folder is None:
225 | folder_to_save = os.path.join('debugging_results', args.c_name)
226 | else:
227 | folder_to_save = folder
228 | if not os.path.exists(folder_to_save):
229 | os.makedirs(folder_to_save, exist_ok=True)
230 | return folder_to_save
231 | d = "Command line interface to convert json files to SBML files"
232 | parser = argparse.ArgumentParser(description=d)
233 | # Logs and saving information
234 | """Command line interface to convert json files to SBML files"""
235 | parser.add_argument("--verbose", help="Default logger is INFO, switch to DEBUG is specified",
236 | dest='verbose', action='store_true', default=False)
237 | parser.add_argument("--log_file", help="Default logger is stderr, switch to log_file if specified",
238 | default=None)
239 | parser.add_argument("--folder_to_save",
240 | help="Folder to store results. Default: temp",
241 | default="temp")
242 | parser.add_argument("--json_convert",
243 | help="File to convert",
244 | default="deoxi_07_no_H/deoxiviolacein_iteration_85.json")
245 | parser.add_argument("--file_name", help = 'File name if name changes.', default = None)
246 | args = parser.parse_args()
247 | # Setting up the logs
248 | if args.verbose:
249 | logging_level = logging.DEBUG
250 | else:
251 | logging_level = logging.INFO
252 | if args.log_file is None:
253 | logging.basicConfig(stream=sys.stderr,
254 | level=logging_level,
255 | datefmt='%d/%m/%Y %H:%M:%S',
256 | format='%(asctime)s -- %(levelname)s -- %(message)s')
257 | else:
258 | if not "log" in args.log_file:
259 | log_file = "log_" + args.log_file
260 | else:
261 | log_file = args.log_file
262 |         log_writer = open("{}/{}".format(define_folder_to_save(args.folder_to_save), log_file), "w")
263 | logging.basicConfig(stream=log_writer,
264 | level=logging_level,
265 | datefmt='%d/%m/%Y %H:%M:%S',
266 | format='%(asctime)s -- %(levelname)s -- %(message)s')
267 |
268 | folder_to_save = define_folder_to_save(args.folder_to_save)
269 | # Choosing file
270 | if args.file_name is None:
271 | model_ID = args.json_convert.split("/")[-1].split(".json")[0]
272 | else:
273 | model_ID = args.file_name
274 | pathway_to_test = json.load(open(args.json_convert, "r"))
275 | convert_json_to_SBML(pathway_to_test, model_ID, folder_to_save = folder_to_save)
276 |
277 | if __name__ == "__main__":
278 | __cli()
279 |
--------------------------------------------------------------------------------
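A minimal sketch of driving the conversion above without the CLI, assuming libsbml is installed and the snippet is run from the repository root; the output folder name is arbitrary:

# Sketch only: convert one of the shipped expected results to SBML.
import json
import os

from convert_to_SBML import convert_json_to_SBML

os.makedirs("temp", exist_ok=True)  # convert_json_to_SBML writes <folder_to_save>/<modelID>.xml
with open("expected_results/deoxiviolacein_1.json") as handle:
    pathway = json.load(handle)
convert_json_to_SBML(pathway, modelID="deoxiviolacein_1", folder_to_save="temp")
# Equivalent CLI call:
# python convert_to_SBML.py --json_convert expected_results/deoxiviolacein_1.json --folder_to_save temp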
/data/base_config.py:
--------------------------------------------------------------------------------
1 | """
2 | the aim of this file is to store configuration parameters, notably for the DB.
3 | It replaces what I previously wanted to define as 'global'
4 | """
5 | try:
6 | from rp3_dcache.Manager import Manager # In house module
7 | from rp3_dcache.Utils import make_document_id, as_document, rdmols_from_document
8 | dcache_installed = True
9 | except ModuleNotFoundError:
10 | dcache_installed = False
11 | import logging
12 | import os
13 |
14 | # Files and addresses configurations - should not be modified:
15 | global DATA_PATH
16 | DATA_PATH = "{}/data".format(os.path.dirname(os.path.abspath(__file__)))
17 |
18 | global add_Hs
19 | add_Hs = True
20 | hydrogen_config = "Using explicit hydrogens : {}".format(add_Hs)
21 |
22 | # Database for storing results configuration
23 | global DB_CACHE
24 | global DB_REPLACE
25 | DB_CACHE = False and dcache_installed
26 | DB_REPLACE = False and dcache_installed
27 | DB_time = 0
28 | if DB_CACHE:
29 | global CACHE_MGR
30 | if add_Hs:
31 | CACHE_MGR = Manager(replace=DB_REPLACE, collection = "results_with_H")
32 | else:
33 | CACHE_MGR = Manager(replace=DB_REPLACE, collection = "results_without_H")
34 | CACHE_MGR.connect()
35 | DB_config = "Setting the DB from config file: Installed package: {}. Using cache DB: {}; Replacing results: {}".format(dcache_installed, DB_CACHE, DB_REPLACE)
36 | elif dcache_installed:
37 | DB_config = "Setting the DB from config file: Installed package: {}. Using cache DB: {}; Replacing results: {}".format(dcache_installed, DB_CACHE, DB_REPLACE)
38 | else:
39 | DB_config = "Setting the DB from config file: Installed package: {}".format(dcache_installed)
40 |
41 | # Mode for using RP3: retrosynthesis or biosensor. QSAR might be implemented one day.
42 | global retrosynthesis
43 | global biosensor
44 | retrosynthesis = True
45 | biosensor = False
46 | tree_mode_config = "Using retrosynthesis: {} - using biosensor {}".format(retrosynthesis, biosensor)
47 |
48 | # Configuring local cache. Could be replaced by a proper caching system one day.
49 | global home_made_cache
50 | home_made_cache = {}
51 |
52 | global use_cache
53 | use_cache = True
54 |
55 | cache_config = "Initialising an empty cache: {}; Using it: {}".format(home_made_cache, use_cache)
56 |
57 | # MCTS parameters for configuration
58 |
59 | global transposition_table
60 | global use_transpositions
61 | global use_transpositions_depth
62 |
63 | transposition_table = {}
64 | use_transpositions = False
65 | use_transpositions_depth = False
66 |
67 | transposition_table_config = "Using transposition tables: {}. With depth: {}".format(use_transpositions, use_transpositions_depth)
68 |
69 | # For toxicity, using log(IC50) as penalty when below 0.
70 | global use_toxicity
71 | try:
72 | import sklearn
73 | from sklearn.neural_network import MLPRegressor
74 | sklearn_here = True
75 | except ModuleNotFoundError:
76 | toxicity_config = "Toxicity will not be enabled because sklearn is not installed"
77 | sklearn_here = False
78 | use_toxicity = False
79 | use_toxicity = use_toxicity and sklearn_here
80 |
--------------------------------------------------------------------------------
/data/compounds_to_add/TPA_to_add.csv:
--------------------------------------------------------------------------------
1 | name,inchi
2 | MNXM162174,"InChI=1S/C8H10/c1-7-3-5-8(2)6-4-7/h3-6H,1-2H3"
3 |
--------------------------------------------------------------------------------
/data/golden_dataset.csv:
--------------------------------------------------------------------------------
1 | name inchi file_to_add
2 | 1,4-Butanediol InChI=1S/C4H10O2/c5-3-1-2-4-6/h5-6H,1-4H2
3 | 2,3-amino-1,3-propanediol InChI=1S/C3H9NO2/c4-3(1-5)2-6/h3,5-6H,1-2,4H2
4 | 2,5-DHBA InChI=1S/C7H6O4/c8-4-1-2-6(9)5(3-4)7(10)11/h1-3,8-9H,(H,10,11)
5 | 3-methylbutanol InChI=1S/C5H12O/c1-5(2)3-4-6/h5-6H,3-4H2,1-2H3
6 | N-methylpyrrolinium InChI=1S/C5H10N/c1-6-4-2-3-5-6/h4H,2-3,5H2,1H3/q+1
7 | benzyl_alcohol InChI=1S/C7H8O/c8-6-7-4-2-1-3-5-7/h1-5,8H,6H2
8 | caroten InChI=1S/C40H56/c1-31(19-13-21-33(3)25-27-37-35(5)23-15-29-39(37,7)8)17-11-12-18-32(2)20-14-22-34(4)26-28-38-36(6)24-16-30-40(38,9)10/h11-14,17-22,25-28H,15-16,23-24,29-30H2,1-10H3/b12-11+,19-13+,20-14+,27-25+,28-26+,31-17+,32-18+,33-21+,34-22+
9 | cis,cis-muconate InChI=1S/C6H6O4/c7-5(8)3-1-2-4-6(9)10/h1-4H,(H,7,8)(H,9,10)/p-2/b3-1-,4-2-
10 | violacein InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+
11 | glutaric_acid InChI=1S/C5H8O4/c6-4(7)2-1-3-5(8)9/h1-3H2,(H,6,7)(H,8,9)
12 | mesaconic_acid InChI=1S/C5H6O4/c1-3(5(8)9)2-4(6)7/h2H,1H3,(H,6,7)(H,8,9)/b3-2+
13 | naringenin InChI=1S/C15H12O5/c16-9-3-1-8(2-4-9)13-7-12(19)15-11(18)5-10(17)6-14(15)20-13/h1-6,13,16-18H,7H2
14 | p-hydroxystyrene InChI=1S/C8H8O/c1-2-7-3-5-8(9)6-4-7/h2-6,9H,1H2
15 | piceatannol InChI=1S/C14H12O4/c15-11-5-10(6-12(16)8-11)2-1-9-3-4-13(17)14(18)7-9/h1-8,15-18H/b2-1+
16 | protopanaxadiol InChI=1S/C30H52O3/c1-19(2)10-9-14-30(8,33)20-11-16-29(7)25(20)21(31)18-23-27(5)15-13-24(32)26(3,4)22(27)12-17-28(23,29)6/h10,20-25,31-33H,9,11-18H2,1-8H3/t20-,21+,22-,23+,24-,25-,27-,28+,29+,30+/m0/s1
17 | TPA InChI=1S/C8H6O4/c9-7(10)5-1-2-6(4-3-5)8(11)12/h1-4H,(H,9,10)(H,11,12) clean_data/compounds_to_add/TPA_to_add.csv
18 | vanillin InChI=1S/C8H8O3/c1-11-8-4-6(5-9)2-3-7(8)10/h2-5,10H,1H3
19 | lycopene InChI=1S/C40H56/c1-33(2)19-13-23-37(7)27-17-31-39(9)29-15-25-35(5)21-11-12-22-36(6)26-16-30-40(10)32-18-28-38(8)24-14-20-34(3)4/h11-12,15-22,25-32H,13-14,23-24H2,1-10H3/b12-11+,25-15+,26-16+,31-17+,32-18+,35-21+,36-22+,37-27+,38-28+,39-29+,40-30+
20 | pinocembrin InChI=1S/C15H12O4/c16-10-6-11(17)15-12(18)8-13(19-14(15)7-10)9-4-2-1-3-5-9/h1-7,13,16-17H,8H2/t13-/m0/s1
21 | styrene InChI=1S/C8H8/c1-2-8-6-4-3-5-7-8/h2-7H,1H2
22 |
--------------------------------------------------------------------------------
/data/sinks/ecoli_core_sink_reduced_rp_ready.csv:
--------------------------------------------------------------------------------
1 | "name","inchi"
2 | "2-Oxoglutarate","InChI=1S/C5H6O5/c6-3(5(9)10)1-2-4(7)8/h1-2H2,(H,7,8)(H,9,10)"
3 | "3-Phospho-D-glycerate","InChI=1S/C3H7O7P/c4-2(3(5)6)1-10-11(7,8)9/h2,4H,1H2,(H,5,6)(H2,7,8,9)"
4 | "3-Phospho-D-glyceroyl phosphate","InChI=1S/C3H8O10P2/c4-2(1-12-14(6,7)8)3(5)13-15(9,10)11/h2,4H,1H2,(H2,6,7,8)(H2,9,10,11)"
5 | "6-Phospho-D-gluconate","InChI=1S/C6H13O10P/c7-2(1-16-17(13,14)15)3(8)4(9)5(10)6(11)12/h2-5,7-10H,1H2,(H,11,12)(H2,13,14,15)"
6 | "6-phospho-D-glucono-1,5-lactone","InChI=1S/C6H11O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8/h2-5,7-9H,1H2,(H2,11,12,13)"
7 | "ADP C10H12N5O10P2","InChI=1S/C10H15N5O10P2/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(24-10)1-23-27(21,22)25-26(18,19)20/h2-4,6-7,10,16-17H,1H2,(H,21,22)(H2,11,12,13)(H2,18,19,20)"
8 | "AMP C10H12N5O7P","InChI=1S/C10H14N5O7P/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(22-10)1-21-23(18,19)20/h2-4,6-7,10,16-17H,1H2,(H2,11,12,13)(H2,18,19,20)"
9 | "ATP C10H12N5O13P3","InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(3-14-5)10-7(17)6(16)4(26-10)1-25-30(21,22)28-31(23,24)27-29(18,19)20/h2-4,6-7,10,16-17H,1H2,(H,21,22)(H,23,24)(H2,11,12,13)(H2,18,19,20)"
10 | "Acetaldehyde","InChI=1S/C2H4O/c1-2-3/h2H,1H3"
11 | "Acetate","InChI=1S/C2H4O2/c1-2(3)4/h1H3,(H,3,4)"
12 | "Acetyl phosphate","InChI=1S/C2H5O5P/c1-2(3)7-8(4,5)6/h1H3,(H2,4,5,6)"
13 | "Acetyl-CoA","InChI=1S/C23H38N7O17P3S/c1-12(31)51-7-6-25-14(32)4-5-26-21(35)18(34)23(2,3)9-44-50(41,42)47-49(39,40)43-8-13-17(46-48(36,37)38)16(33)22(45-13)30-11-29-15-19(24)27-10-28-20(15)30/h10-11,13,16-18,22,33-34H,4-9H2,1-3H3,(H,25,32)(H,26,35)(H,39,40)(H,41,42)(H2,24,27,28)(H2,36,37,38)"
14 | "Alpha-D-Ribose 5-phosphate","InChI=1S/C5H11O8P/c6-3-2(1-12-14(9,10)11)13-5(8)4(3)7/h2-8H,1H2,(H2,9,10,11)"
15 | "Ammonium","InChI=1S/H3N/h1H3"
16 | "CO2 CO2","InChI=1S/CO2/c2-1-3"
17 | "Cis-Aconitate","InChI=1S/C6H6O6/c7-4(8)1-3(6(11)12)2-5(9)10/h1H,2H2,(H,7,8)(H,9,10)(H,11,12)"
18 | "Citrate","InChI=1S/C6H8O7/c7-3(8)1-6(13,5(11)12)2-4(9)10/h13H,1-2H2,(H,7,8)(H,9,10)(H,11,12)"
19 | "Coenzyme A","InChI=1S/C21H36N7O16P3S/c1-21(2,16(31)19(32)24-4-3-12(29)23-5-6-48)8-41-47(38,39)44-46(36,37)40-7-11-15(43-45(33,34)35)14(30)20(42-11)28-10-27-13-17(22)25-9-26-18(13)28/h9-11,14-16,20,30-31,48H,3-8H2,1-2H3,(H,23,29)(H,24,32)(H,36,37)(H,38,39)(H2,22,25,26)(H2,33,34,35)"
20 | "D-Erythrose 4-phosphate","InChI=1S/C4H9O7P/c5-1-3(6)4(7)2-11-12(8,9)10/h1,3-4,6-7H,2H2,(H2,8,9,10)"
21 | "D-Fructose 1,6-bisphosphate","InChI=1S/C6H14O12P2/c7-4-3(1-16-19(10,11)12)18-6(9,5(4)8)2-17-20(13,14)15/h3-5,7-9H,1-2H2,(H2,10,11,12)(H2,13,14,15)"
22 | "D-Fructose 6-phosphate","InChI=1S/C6H13O9P/c7-2-6(10)5(9)4(8)3(15-6)1-14-16(11,12)13/h3-5,7-10H,1-2H2,(H2,11,12,13)"
23 | "D-Glucose 6-phosphate","InChI=1S/C6H13O9P/c7-3-2(1-14-16(11,12)13)15-6(10)5(9)4(3)8/h2-10H,1H2,(H2,11,12,13)"
24 | "D-Glycerate 2-phosphate","InChI=1S/C3H7O7P/c4-1-2(3(5)6)10-11(7,8)9/h2,4H,1H2,(H,5,6)(H2,7,8,9)"
25 | "D-Lactate","InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)"
26 | "D-Ribulose 5-phosphate","InChI=1S/C5H11O8P/c6-1-3(7)5(9)4(8)2-13-14(10,11)12/h4-6,8-9H,1-2H2,(H2,10,11,12)"
27 | "D-Xylulose 5-phosphate","InChI=1S/C5H11O8P/c6-1-3(7)5(9)4(8)2-13-14(10,11)12/h4-6,8-9H,1-2H2,(H2,10,11,12)"
28 | "Dihydroxyacetone phosphate","InChI=1S/C3H7O6P/c4-1-3(5)2-9-10(6,7)8/h4H,1-2H2,(H2,6,7,8)"
29 | "Ethanol","InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3"
30 | "Formate","InChI=1S/CH2O2/c2-1-3/h1H,(H,2,3)"
31 | "Fumarate","InChI=1S/C4H4O4/c5-3(6)1-2-4(7)8/h1-2H,(H,5,6)(H,7,8)"
32 | "Glyceraldehyde 3-phosphate","InChI=1S/C3H7O6P/c4-1-3(5)2-9-10(6,7)8/h1,3,5H,2H2,(H2,6,7,8)"
33 | "Glyoxylate","InChI=1S/C2H2O3/c3-1-2(4)5/h1H,(H,4,5)"
34 | "H+","InChI=1S/p+1"
35 | "H2O H2O","InChI=1S/H2O/h1H2"
36 | "Isocitrate","InChI=1S/C6H8O7/c7-3(8)1-2(5(10)11)4(9)6(12)13/h2,4,9H,1H2,(H,7,8)(H,10,11)(H,12,13)"
37 | "L-Glutamate","InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H,7,8)(H,9,10)"
38 | "L-Glutamine","InChI=1S/C5H10N2O3/c6-3(5(9)10)1-2-4(7)8/h3H,1-2,6H2,(H2,7,8)(H,9,10)"
39 | "L-Malate","InChI=1S/C4H6O5/c5-2(4(8)9)1-3(6)7/h2,5H,1H2,(H,6,7)(H,8,9)"
40 | "Nicotinamide adenine dinucleotide","InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(32)14(30)11(41-21)6-39-44(36,37)42-43(34,35)38-5-10-13(29)15(31)20(40-10)27-3-1-2-9(4-27)18(23)33/h1-4,7-8,10-11,13-16,20-21,29-32H,5-6H2,(H5-,22,23,24,25,33,34,35,36,37)/p+1"
41 | "Nicotinamide adenine dinucleotide - reduced","InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(32)14(30)11(41-21)6-39-44(36,37)42-43(34,35)38-5-10-13(29)15(31)20(40-10)27-3-1-2-9(4-27)18(23)33/h1,3-4,7-8,10-11,13-16,20-21,29-32H,2,5-6H2,(H2,23,33)(H,34,35)(H,36,37)(H2,22,24,25)"
42 | "Nicotinamide adenine dinucleotide phosphate","InChI=1S/C21H28N7O17P3/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(44-46(33,34)35)14(30)11(43-21)6-41-48(38,39)45-47(36,37)40-5-10-13(29)15(31)20(42-10)27-3-1-2-9(4-27)18(23)32/h1-4,7-8,10-11,13-16,20-21,29-31H,5-6H2,(H7-,22,23,24,25,32,33,34,35,36,37,38,39)/p+1"
43 | "Nicotinamide adenine dinucleotide phosphate - reduced","InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17)28(8-26-12)21-16(44-46(33,34)35)14(30)11(43-21)6-41-48(38,39)45-47(36,37)40-5-10-13(29)15(31)20(42-10)27-3-1-2-9(4-27)18(23)32/h1,3-4,7-8,10-11,13-16,20-21,29-31H,2,5-6H2,(H2,23,32)(H,36,37)(H,38,39)(H2,22,24,25)(H2,33,34,35)"
44 | "O2 O2","InChI=1S/O2/c1-2"
45 | "Oxaloacetate","InChI=1S/C4H4O5/c5-2(4(8)9)1-3(6)7/h1H2,(H,6,7)(H,8,9)"
46 | "Phosphate","InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)"
47 | "Phosphoenolpyruvate","InChI=1S/C3H5O6P/c1-2(3(4)5)9-10(6,7)8/h1H2,(H,4,5)(H2,6,7,8)"
48 | "Pyruvate","InChI=1S/C3H4O3/c1-2(4)3(5)6/h1H3,(H,5,6)"
49 | "Sedoheptulose 7-phosphate","InChI=1S/C7H15O10P/c8-1-3(9)5(11)7(13)6(12)4(10)2-17-18(14,15)16/h4-8,10-13H,1-2H2,(H2,14,15,16)"
50 | "Succinate","InChI=1S/C4H6O4/c5-3(6)1-2-4(7)8/h1-2H2,(H,5,6)(H,7,8)"
51 | "Succinyl-CoA","InChI=1S/C25H40N7O19P3S/c1-25(2,20(38)23(39)28-6-5-14(33)27-7-8-55-16(36)4-3-15(34)35)10-48-54(45,46)51-53(43,44)47-9-13-19(50-52(40,41)42)18(37)24(49-13)32-12-31-17-21(26)29-11-30-22(17)32/h11-13,18-20,24,37-38H,3-10H2,1-2H3,(H,27,33)(H,28,39)(H,34,35)(H,43,44)(H,45,46)(H2,26,29,30)(H2,40,41,42)"
52 | "Ubiquinol-8","InChI=1S/C49H76O4/c1-36(2)20-13-21-37(3)22-14-23-38(4)24-15-25-39(5)26-16-27-40(6)28-17-29-41(7)30-18-31-42(8)32-19-33-43(9)34-35-45-44(10)46(50)48(52-11)49(53-12)47(45)51/h20,22,24,26,28,30,32,34,50-51H,13-19,21,23,25,27,29,31,33,35H2,1-12H3"
53 | "Ubiquinone-8","InChI=1S/C49H74O4/c1-36(2)20-13-21-37(3)22-14-23-38(4)24-15-25-39(5)26-16-27-40(6)28-17-29-41(7)30-18-31-42(8)32-19-33-43(9)34-35-45-44(10)46(50)48(52-11)49(53-12)47(45)51/h20,22,24,26,28,30,32,34H,13-19,21,23,25,27,29,31,33,35H2,1-12H3"
54 |
--------------------------------------------------------------------------------
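Sink files such as the one above use the quoted "name","inchi" layout expected by import_organism_from_csv in organisms.py; a minimal sketch that only reads the raw rows, without building Compound objects or standardising structures:

# Sketch only: load a sink file into a {name: InChI} dict, without Compound standardisation.
import csv

sink = {}
with open("data/sinks/ecoli_core_sink_reduced_rp_ready.csv") as handle:
    for row in csv.DictReader(handle, delimiter=","):
        if row["inchi"] not in (None, "", "None"):  # mirrors the check in import_organism_from_csv
            sink[row["name"]] = row["inchi"]
print("{} sink compounds loaded".format(len(sink)))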
/data/supplement_finder/data/metanetx_extracted_inchikeys.json.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/data/supplement_finder/data/metanetx_extracted_inchikeys.json.tar.gz
--------------------------------------------------------------------------------
/data/supplement_finder/tree_for_testing/TPA/pickles/tree_end_search.pkl.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/data/supplement_finder/tree_for_testing/TPA/pickles/tree_end_search.pkl.tar.gz
--------------------------------------------------------------------------------
/data/supplement_finder/tree_for_testing/morphine/pickles/tree_end_search.pkl.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/data/supplement_finder/tree_for_testing/morphine/pickles/tree_end_search.pkl.tar.gz
--------------------------------------------------------------------------------
/document_all_options.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | The aim of this file is to document all options available to run the MCTS and where to find them.
4 | More details are in the attached paper at https://doi.org/10.1101/800474, especially in the Appendix.
5 |
6 | ### Global configuration options
7 |
8 | - where: in the config.py file
9 | - how: either by modifying the config.py file by hand or by running change_config.py with its argparser (recommended)
10 |
11 | - DB_CACHE: uses the MongoDB cache when activated
12 | - DB_REPLACE: replaces data in the Mongo DB cache when activated
13 | - DB_time: time cut-off for storing in the DB: results are stored only if above the cut-off, otherwise the rule is applied by Python
14 | - use_cache: dictionary for caching results within the script. Highly recommended.
15 | - retrosynthesis: performs a retrosynthetic search; biosensor: performs a biosensor search
16 | Both cannot be activated at the same time.
17 | The main difference is how a state is considered successful: all compounds have to be found for retrosynthesis, and only one for biosensors
18 | - add_Hs: explicit hydrogens. Recommended to leave at False for faster calculations.
19 | - use_transpositions and transposition depth: not stable. Allow for sharing of information between nodes with the same chemical state but at different places in the tree, as done in doi:10.1007/BF03192151.
20 |
21 | ### Tree search configuration:
22 |
23 | - stop_at_first_result: stops once a single pathway is found.
24 | - c_name, c_smiles, c_inchi: information on the chemical compound of interest
25 | - fire_timeout, standardisation_timeout: time allowed for firing a rule/standardising a compound
26 | - organism_name: which model to use for production of compounds
27 | - complementary_sink: csv file containing compounds to add to the sink. If organism_name is None, this file is the full sink.
28 | - representation: how to print results in logs
29 | - itermax: maximum number of iterations allowed for running the Tree search
30 | - parallel: not possible to use at the moment due to workaround for RDKit rule application. Aimed at parallelising rollouts.
31 | - expansion_width: maximum number of children per node
32 | - time_budget: time allowed for running the tree search. The search stops at the end of the first iteration that exceeds this allotted time
33 | - max_depth: maximum depth of the Tree (also the maximum number of pathway steps)
34 | - minimal_visit_counts: minimal number of times a node has to be rolled out before its siblings can be expanded
35 | - UCT_policy: defines the UCT policy to use, i.e. the way to rank the children of a node. Allows various biases and scoring considerations.
36 | - UCTK: the constant defining the exploration/exploitation parameter in the UCT formula
37 | - bias_k: if progressive bias is used, defines the weight of the progressive bias in the UCT formula
38 | - k_rave: if RAVE is used, how to weight the RAVE. Roughly, for visit counts below this value RAVE values lead the UCT, and above it rollout values lead.
39 | - use_RAVE: moves are scored each time they are used throughout the Tree, adapting the RAVE (Rapid Action Value Estimation) principle to the whole tree and not just rollouts.
40 | - penalty: penalty when no compound of the state belongs to the organism
41 | - full_state_reward: reward when all compounds of the state belong to the organism
42 | - pathway_scoring: how to score a pathway when it is found.
43 | - Rollout_policy: how to select moves for the rollout: randomly, or weighted by various scores. Many options are available.
44 | - max_rollout: maximum length of the rollout (it also stops when max_depth is reached)
45 | - chemical_scoring: choose the way to chemically score reactions (considering only substrates or both substrates and products). Possibility to use ConstantChemicalScorer, which always returns 1.
46 | - biological_score_cut_off: cuts off with biological score at the specified level
47 | - substrate_only_score_cut_off: cuts off with substrate similarity only score BEFORE applying the rule at the specified level
48 | - chemical_score_cut_off: cuts off with specified chemical score AFTER applying the rule at the specified level
49 | - virtual_visits: start nodes with virtual_visits prior visits, to reduce stochasticity and variability in the initial Monte Carlo simulations.
50 | - progressive_bias_strategy: policy for the progressive bias (untested)
51 | - progressive_widening: adds a child to nodes visited more than len(nodes)^2 times (untested)
52 | - diameter: specify the diameters (as a list) to use
53 | - EC_filter: allow only certain EC subclasses
54 | - small: development archive
55 | - seed: for reproducibility
56 | - tree_to_complete: if restarting the search from another tree.
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/expected_results/deoxiviolacein_1.json:
--------------------------------------------------------------------------------
1 | {
2 | "elements": {
3 | "nodes": [
4 | {
5 | "data": {
6 | "SMILES": "NC(Cc1c[nH]c2ccccc12)C(=O)O",
7 | "inSink": 1,
8 | "isSource": 0,
9 | "InChI": "InChI=1S/C11H12N2O2/c12-9(11(14)15)5-7-6-13-10-4-2-1-3-8(7)10/h1-4,6,9,13H,5,12H2,(H,14,15)",
10 | "Names": [
11 | "QIVBCDIJIAJPQS-UHFFFAOYSA-N",
12 | "L-Tryptophan"
13 | ],
14 | "id": "QIVBCDIJIAJPQS-UHFFFAOYSA-N",
15 | "type": "compound",
16 | "Rule ID": null,
17 | "EC number": null,
18 | "Reaction SMILES": null,
19 | "Diameter": null,
20 | "Score": null,
21 | "Iteration": null
22 | }
23 | },
24 | {
25 | "data": {
26 | "SMILES": "N=C(Cc1c[nH]c2ccccc12)C(=O)O",
27 | "inSink": 0,
28 | "isSource": 0,
29 | "InChI": "InChI=1S/C11H10N2O2/c12-9(11(14)15)5-7-6-13-10-4-2-1-3-8(7)10/h1-4,6,12-13H,5H2,(H,14,15)",
30 | "Names": [
31 | "LKYWXXAVLLVJAS-UHFFFAOYSA-N"
32 | ],
33 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N",
34 | "type": "compound",
35 | "Rule ID": null,
36 | "EC number": null,
37 | "Reaction SMILES": null,
38 | "Diameter": null,
39 | "Score": null,
40 | "Iteration": null
41 | }
42 | },
43 | {
44 | "data": {
45 | "SMILES": "N=C(C(=O)O)C(c1c[nH]c2ccccc12)C(C(=N)C(=O)O)c1c[nH]c2ccccc12",
46 | "inSink": 0,
47 | "isSource": 0,
48 | "InChI": "InChI=1S/C22H18N4O4/c23-19(21(27)28)17(13-9-25-15-7-3-1-5-11(13)15)18(20(24)22(29)30)14-10-26-16-8-4-2-6-12(14)16/h1-10,17-18,23-26H,(H,27,28)(H,29,30)",
49 | "Names": [
50 | "CKBGWXPNAUCVQQ-UHFFFAOYSA-N"
51 | ],
52 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N",
53 | "type": "compound",
54 | "Rule ID": null,
55 | "EC number": null,
56 | "Reaction SMILES": null,
57 | "Diameter": null,
58 | "Score": null,
59 | "Iteration": null
60 | }
61 | },
62 | {
63 | "data": {
64 | "SMILES": "O=O",
65 | "inSink": 1,
66 | "isSource": 0,
67 | "InChI": "InChI=1S/O2/c1-2",
68 | "Names": [
69 | "MYMOFIZGZYHOMD-UHFFFAOYSA-N",
70 | "O2 O2"
71 | ],
72 | "id": "MYMOFIZGZYHOMD-UHFFFAOYSA-N",
73 | "type": "compound",
74 | "Rule ID": null,
75 | "EC number": null,
76 | "Reaction SMILES": null,
77 | "Diameter": null,
78 | "Score": null,
79 | "Iteration": null
80 | }
81 | },
82 | {
83 | "data": {
84 | "SMILES": "O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12",
85 | "inSink": 0,
86 | "isSource": 0,
87 | "InChI": "InChI=1S/C21H15N3O2/c25-21(26)20-14(15-10-22-17-7-3-1-5-12(15)17)9-19(24-20)16-11-23-18-8-4-2-6-13(16)18/h1-11,22-24H,(H,25,26)",
88 | "Names": [
89 | "SFLGFRJGKHRRID-UHFFFAOYSA-N"
90 | ],
91 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N",
92 | "type": "compound",
93 | "Rule ID": null,
94 | "EC number": null,
95 | "Reaction SMILES": null,
96 | "Diameter": null,
97 | "Score": null,
98 | "Iteration": null
99 | }
100 | },
101 | {
102 | "data": {
103 | "SMILES": "O=C1NC(c2c[nH]c3ccccc23)=CC1=C1C(=O)Nc2ccccc21",
104 | "inSink": 0,
105 | "isSource": 1,
106 | "InChI": "InChI=1S/C20H13N3O2/c24-19-13(18-12-6-2-4-8-16(12)22-20(18)25)9-17(23-19)14-10-21-15-7-3-1-5-11(14)15/h1-10,21H,(H,22,25)(H,23,24)",
107 | "Names": [
108 | "deoxiviolacein",
109 | "OJUJNNKCVPCATE-UHFFFAOYSA-N"
110 | ],
111 | "id": "OJUJNNKCVPCATE-UHFFFAOYSA-N",
112 | "type": "compound",
113 | "Rule ID": null,
114 | "EC number": null,
115 | "Reaction SMILES": null,
116 | "Diameter": null,
117 | "Score": null,
118 | "Iteration": null
119 | }
120 | },
121 | {
122 | "data": {
123 | "SMILES": null,
124 | "inSink": null,
125 | "isSource": null,
126 | "InChI": null,
127 | "Names": null,
128 | "id": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
129 | "type": "reaction",
130 | "Rule ID": [
131 | "RR-02-8907c369787578b3-16-F",
132 | "RR-02-8907c369787578b3-14-F",
133 | "RR-02-8907c369787578b3-12-F",
134 | "RR-02-8907c369787578b3-10-F"
135 | ],
136 | "EC number": [
137 | "1.14.13.224"
138 | ],
139 | "Reaction SMILES": "O=C1NC(c2c[nH]c3ccccc23)=CC1=C1C(=O)Nc2ccccc21>>O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12.O=O",
140 | "Diameter": 16,
141 | "Score": 1.0,
142 | "ChemicalScore": 1.0,
143 | "Iteration": 1,
144 | "Stoechiometry": {
145 | "SFLGFRJGKHRRID-UHFFFAOYSA-N": 1,
146 | "MYMOFIZGZYHOMD-UHFFFAOYSA-N": 1
147 | }
148 | }
149 | },
150 | {
151 | "data": {
152 | "SMILES": null,
153 | "inSink": null,
154 | "isSource": null,
155 | "InChI": null,
156 | "Names": null,
157 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2",
158 | "type": "reaction",
159 | "Rule ID": [
160 | "RR-02-74068b9f6b2efdc1-16-F",
161 | "RR-02-74068b9f6b2efdc1-14-F",
162 | "RR-02-74068b9f6b2efdc1-12-F",
163 | "RR-02-74068b9f6b2efdc1-10-F"
164 | ],
165 | "EC number": [
166 | ""
167 | ],
168 | "Reaction SMILES": "O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12>>N=C(C(=O)O)C(c1c[nH]c2ccccc12)C(C(=N)C(=O)O)c1c[nH]c2ccccc12",
169 | "Diameter": 16,
170 | "Score": 1.0,
171 | "ChemicalScore": 1.0,
172 | "Iteration": 2,
173 | "Stoechiometry": {
174 | "CKBGWXPNAUCVQQ-UHFFFAOYSA-N": 1
175 | }
176 | }
177 | },
178 | {
179 | "data": {
180 | "SMILES": null,
181 | "inSink": null,
182 | "isSource": null,
183 | "InChI": null,
184 | "Names": null,
185 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3",
186 | "type": "reaction",
187 | "Rule ID": [
188 | "RR-02-47e9577f4cb98f97-16-F",
189 | "RR-02-47e9577f4cb98f97-14-F",
190 | "RR-02-47e9577f4cb98f97-12-F",
191 | "RR-02-47e9577f4cb98f97-10-F"
192 | ],
193 | "EC number": [
194 | "1.21.98"
195 | ],
196 | "Reaction SMILES": "N=C(C(=O)O)C(c1c[nH]c2ccccc12)C(C(=N)C(=O)O)c1c[nH]c2ccccc12>>N=C(Cc1c[nH]c2ccccc12)C(=O)O.N=C(Cc1c[nH]c2ccccc12)C(=O)O",
197 | "Diameter": 16,
198 | "Score": 1.0,
199 | "ChemicalScore": 1.0,
200 | "Iteration": 3,
201 | "Stoechiometry": {
202 | "LKYWXXAVLLVJAS-UHFFFAOYSA-N": 2
203 | }
204 | }
205 | },
206 | {
207 | "data": {
208 | "SMILES": null,
209 | "inSink": null,
210 | "isSource": null,
211 | "InChI": null,
212 | "Names": null,
213 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4",
214 | "type": "reaction",
215 | "Rule ID": [
216 | "RR-02-0c9c5a5559e132c7-16-F",
217 | "RR-02-0c9c5a5559e132c7-14-F",
218 | "RR-02-0c9c5a5559e132c7-12-F",
219 | "RR-02-bbedd3c9b9124d30-10-F"
220 | ],
221 | "EC number": [
222 | "1.3.3.10",
223 | "1.4.3",
224 | "1.4.3.-"
225 | ],
226 | "Reaction SMILES": "N=C(Cc1c[nH]c2ccccc12)C(=O)O>>NC(Cc1c[nH]c2ccccc12)C(=O)O",
227 | "Diameter": 16,
228 | "Score": 0.453552175675181,
229 | "ChemicalScore": 1.0,
230 | "Iteration": 4,
231 | "Stoechiometry": {
232 | "QIVBCDIJIAJPQS-UHFFFAOYSA-N": 1
233 | }
234 | }
235 | }
236 | ],
237 | "edges": [
238 | {
239 | "data": {
240 | "target": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
241 | "source": "OJUJNNKCVPCATE-UHFFFAOYSA-N",
242 | "id": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1_=>_OJUJNNKCVPCATE-UHFFFAOYSA-N"
243 | }
244 | },
245 | {
246 | "data": {
247 | "target": "SFLGFRJGKHRRID-UHFFFAOYSA-N",
248 | "source": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
249 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N_=>_OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1"
250 | }
251 | },
252 | {
253 | "data": {
254 | "target": "MYMOFIZGZYHOMD-UHFFFAOYSA-N",
255 | "source": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
256 | "id": "MYMOFIZGZYHOMD-UHFFFAOYSA-N_=>_OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1"
257 | }
258 | },
259 | {
260 | "data": {
261 | "target": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2",
262 | "source": "SFLGFRJGKHRRID-UHFFFAOYSA-N",
263 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2_=>_SFLGFRJGKHRRID-UHFFFAOYSA-N"
264 | }
265 | },
266 | {
267 | "data": {
268 | "target": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N",
269 | "source": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2",
270 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N_=>_SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2"
271 | }
272 | },
273 | {
274 | "data": {
275 | "target": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3",
276 | "source": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N",
277 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3_=>_CKBGWXPNAUCVQQ-UHFFFAOYSA-N"
278 | }
279 | },
280 | {
281 | "data": {
282 | "target": "LKYWXXAVLLVJAS-UHFFFAOYSA-N",
283 | "source": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3",
284 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N_=>_CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3"
285 | }
286 | },
287 | {
288 | "data": {
289 | "target": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4",
290 | "source": "LKYWXXAVLLVJAS-UHFFFAOYSA-N",
291 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4_=>_LKYWXXAVLLVJAS-UHFFFAOYSA-N"
292 | }
293 | },
294 | {
295 | "data": {
296 | "target": "QIVBCDIJIAJPQS-UHFFFAOYSA-N",
297 | "source": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4",
298 | "id": "QIVBCDIJIAJPQS-UHFFFAOYSA-N_=>_LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4"
299 | }
300 | }
301 | ]
302 | }
303 | }
--------------------------------------------------------------------------------
/expected_results/deoxiviolacein_iteration_15.json:
--------------------------------------------------------------------------------
1 | {
2 | "elements": {
3 | "nodes": [
4 | {
5 | "data": {
6 | "SMILES": "NC(Cc1c[nH]c2ccccc12)C(=O)O",
7 | "inSink": 1,
8 | "isSource": 0,
9 | "InChI": "InChI=1S/C11H12N2O2/c12-9(11(14)15)5-7-6-13-10-4-2-1-3-8(7)10/h1-4,6,9,13H,5,12H2,(H,14,15)",
10 | "Names": [
11 | "QIVBCDIJIAJPQS-UHFFFAOYSA-N",
12 | "L-Tryptophan"
13 | ],
14 | "id": "QIVBCDIJIAJPQS-UHFFFAOYSA-N",
15 | "type": "compound",
16 | "Rule ID": null,
17 | "EC number": null,
18 | "Reaction SMILES": null,
19 | "Diameter": null,
20 | "Score": null,
21 | "Iteration": null
22 | }
23 | },
24 | {
25 | "data": {
26 | "SMILES": "N=C(Cc1c[nH]c2ccccc12)C(=O)O",
27 | "inSink": 0,
28 | "isSource": 0,
29 | "InChI": "InChI=1S/C11H10N2O2/c12-9(11(14)15)5-7-6-13-10-4-2-1-3-8(7)10/h1-4,6,12-13H,5H2,(H,14,15)",
30 | "Names": [
31 | "LKYWXXAVLLVJAS-UHFFFAOYSA-N"
32 | ],
33 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N",
34 | "type": "compound",
35 | "Rule ID": null,
36 | "EC number": null,
37 | "Reaction SMILES": null,
38 | "Diameter": null,
39 | "Score": null,
40 | "Iteration": null
41 | }
42 | },
43 | {
44 | "data": {
45 | "SMILES": "N=C(C(=O)O)C(c1c[nH]c2ccccc12)C(C(=N)C(=O)O)c1c[nH]c2ccccc12",
46 | "inSink": 0,
47 | "isSource": 0,
48 | "InChI": "InChI=1S/C22H18N4O4/c23-19(21(27)28)17(13-9-25-15-7-3-1-5-11(13)15)18(20(24)22(29)30)14-10-26-16-8-4-2-6-12(14)16/h1-10,17-18,23-26H,(H,27,28)(H,29,30)",
49 | "Names": [
50 | "CKBGWXPNAUCVQQ-UHFFFAOYSA-N"
51 | ],
52 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N",
53 | "type": "compound",
54 | "Rule ID": null,
55 | "EC number": null,
56 | "Reaction SMILES": null,
57 | "Diameter": null,
58 | "Score": null,
59 | "Iteration": null
60 | }
61 | },
62 | {
63 | "data": {
64 | "SMILES": "O=O",
65 | "inSink": 1,
66 | "isSource": 0,
67 | "InChI": "InChI=1S/O2/c1-2",
68 | "Names": [
69 | "MYMOFIZGZYHOMD-UHFFFAOYSA-N",
70 | "O2 O2"
71 | ],
72 | "id": "MYMOFIZGZYHOMD-UHFFFAOYSA-N",
73 | "type": "compound",
74 | "Rule ID": null,
75 | "EC number": null,
76 | "Reaction SMILES": null,
77 | "Diameter": null,
78 | "Score": null,
79 | "Iteration": null
80 | }
81 | },
82 | {
83 | "data": {
84 | "SMILES": "O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12",
85 | "inSink": 0,
86 | "isSource": 0,
87 | "InChI": "InChI=1S/C21H15N3O2/c25-21(26)20-14(15-10-22-17-7-3-1-5-12(15)17)9-19(24-20)16-11-23-18-8-4-2-6-13(16)18/h1-11,22-24H,(H,25,26)",
88 | "Names": [
89 | "SFLGFRJGKHRRID-UHFFFAOYSA-N"
90 | ],
91 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N",
92 | "type": "compound",
93 | "Rule ID": null,
94 | "EC number": null,
95 | "Reaction SMILES": null,
96 | "Diameter": null,
97 | "Score": null,
98 | "Iteration": null
99 | }
100 | },
101 | {
102 | "data": {
103 | "SMILES": "O=C1NC(c2c[nH]c3ccccc23)=CC1=C1C(=O)Nc2ccccc21",
104 | "inSink": 0,
105 | "isSource": 1,
106 | "InChI": "InChI=1S/C20H13N3O2/c24-19-13(18-12-6-2-4-8-16(12)22-20(18)25)9-17(23-19)14-10-21-15-7-3-1-5-11(14)15/h1-10,21H,(H,22,25)(H,23,24)",
107 | "Names": [
108 | "deoxiviolacein",
109 | "OJUJNNKCVPCATE-UHFFFAOYSA-N"
110 | ],
111 | "id": "OJUJNNKCVPCATE-UHFFFAOYSA-N",
112 | "type": "compound",
113 | "Rule ID": null,
114 | "EC number": null,
115 | "Reaction SMILES": null,
116 | "Diameter": null,
117 | "Score": null,
118 | "Iteration": null
119 | }
120 | },
121 | {
122 | "data": {
123 | "SMILES": null,
124 | "inSink": null,
125 | "isSource": null,
126 | "InChI": null,
127 | "Names": null,
128 | "id": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
129 | "type": "reaction",
130 | "Rule ID": [
131 | "RR-02-8907c369787578b3-16-F",
132 | "RR-02-8907c369787578b3-14-F",
133 | "RR-02-8907c369787578b3-12-F",
134 | "RR-02-8907c369787578b3-10-F"
135 | ],
136 | "EC number": [
137 | "1.14.13.224"
138 | ],
139 | "Reaction SMILES": "O=C1NC(c2c[nH]c3ccccc23)=CC1=C1C(=O)Nc2ccccc21>>O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12.O=O",
140 | "Diameter": 16,
141 | "Score": 1.0,
142 | "ChemicalScore": 1.0,
143 | "Iteration": 1,
144 | "Stoechiometry": {
145 | "SFLGFRJGKHRRID-UHFFFAOYSA-N": 1,
146 | "MYMOFIZGZYHOMD-UHFFFAOYSA-N": 1
147 | }
148 | }
149 | },
150 | {
151 | "data": {
152 | "SMILES": null,
153 | "inSink": null,
154 | "isSource": null,
155 | "InChI": null,
156 | "Names": null,
157 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2",
158 | "type": "reaction",
159 | "Rule ID": [
160 | "RR-02-74068b9f6b2efdc1-16-F",
161 | "RR-02-74068b9f6b2efdc1-14-F",
162 | "RR-02-74068b9f6b2efdc1-12-F",
163 | "RR-02-74068b9f6b2efdc1-10-F"
164 | ],
165 | "EC number": [
166 | ""
167 | ],
168 | "Reaction SMILES": "O=C(O)c1[nH]c(-c2c[nH]c3ccccc23)cc1-c1c[nH]c2ccccc12>>N=C(C(=O)O)C(c1c[nH]c2ccccc12)C(C(=N)C(=O)O)c1c[nH]c2ccccc12",
169 | "Diameter": 16,
170 | "Score": 1.0,
171 | "ChemicalScore": 1.0,
172 | "Iteration": 2,
173 | "Stoechiometry": {
174 | "CKBGWXPNAUCVQQ-UHFFFAOYSA-N": 1
175 | }
176 | }
177 | },
178 | {
179 | "data": {
180 | "SMILES": null,
181 | "inSink": null,
182 | "isSource": null,
183 | "InChI": null,
184 | "Names": null,
185 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3",
186 | "type": "reaction",
187 | "Rule ID": [
188 | "RR-02-47e9577f4cb98f97-16-F",
189 | "RR-02-47e9577f4cb98f97-14-F",
190 | "RR-02-47e9577f4cb98f97-12-F",
191 | "RR-02-47e9577f4cb98f97-10-F"
192 | ],
193 | "EC number": [
194 | "1.21.98"
195 | ],
196 | "Reaction SMILES": "N=C(C(=O)O)C(c1c[nH]c2ccccc12)C(C(=N)C(=O)O)c1c[nH]c2ccccc12>>N=C(Cc1c[nH]c2ccccc12)C(=O)O.N=C(Cc1c[nH]c2ccccc12)C(=O)O",
197 | "Diameter": 16,
198 | "Score": 1.0,
199 | "ChemicalScore": 1.0,
200 | "Iteration": 3,
201 | "Stoechiometry": {
202 | "LKYWXXAVLLVJAS-UHFFFAOYSA-N": 2
203 | }
204 | }
205 | },
206 | {
207 | "data": {
208 | "SMILES": null,
209 | "inSink": null,
210 | "isSource": null,
211 | "InChI": null,
212 | "Names": null,
213 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4",
214 | "type": "reaction",
215 | "Rule ID": [
216 | "RR-02-0c9c5a5559e132c7-16-F",
217 | "RR-02-0c9c5a5559e132c7-14-F",
218 | "RR-02-0c9c5a5559e132c7-12-F",
219 | "RR-02-bbedd3c9b9124d30-10-F"
220 | ],
221 | "EC number": [
222 | "1.3.3.10",
223 | "1.4.3",
224 | "1.4.3.-"
225 | ],
226 | "Reaction SMILES": "N=C(Cc1c[nH]c2ccccc12)C(=O)O>>NC(Cc1c[nH]c2ccccc12)C(=O)O",
227 | "Diameter": 16,
228 | "Score": 0.453552175675181,
229 | "ChemicalScore": 1.0,
230 | "Iteration": 4,
231 | "Stoechiometry": {
232 | "QIVBCDIJIAJPQS-UHFFFAOYSA-N": 1
233 | }
234 | }
235 | }
236 | ],
237 | "edges": [
238 | {
239 | "data": {
240 | "target": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
241 | "source": "OJUJNNKCVPCATE-UHFFFAOYSA-N",
242 | "id": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1_=>_OJUJNNKCVPCATE-UHFFFAOYSA-N"
243 | }
244 | },
245 | {
246 | "data": {
247 | "target": "SFLGFRJGKHRRID-UHFFFAOYSA-N",
248 | "source": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
249 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N_=>_OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1"
250 | }
251 | },
252 | {
253 | "data": {
254 | "target": "MYMOFIZGZYHOMD-UHFFFAOYSA-N",
255 | "source": "OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1",
256 | "id": "MYMOFIZGZYHOMD-UHFFFAOYSA-N_=>_OJUJNNKCVPCATE-UHFFFAOYSA-N-RR-02-8907c369787578b3-16-F-0-1"
257 | }
258 | },
259 | {
260 | "data": {
261 | "target": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2",
262 | "source": "SFLGFRJGKHRRID-UHFFFAOYSA-N",
263 | "id": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2_=>_SFLGFRJGKHRRID-UHFFFAOYSA-N"
264 | }
265 | },
266 | {
267 | "data": {
268 | "target": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N",
269 | "source": "SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2",
270 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N_=>_SFLGFRJGKHRRID-UHFFFAOYSA-N-RR-02-74068b9f6b2efdc1-16-F-0-2"
271 | }
272 | },
273 | {
274 | "data": {
275 | "target": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3",
276 | "source": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N",
277 | "id": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3_=>_CKBGWXPNAUCVQQ-UHFFFAOYSA-N"
278 | }
279 | },
280 | {
281 | "data": {
282 | "target": "LKYWXXAVLLVJAS-UHFFFAOYSA-N",
283 | "source": "CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3",
284 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N_=>_CKBGWXPNAUCVQQ-UHFFFAOYSA-N-RR-02-47e9577f4cb98f97-16-F-0-3"
285 | }
286 | },
287 | {
288 | "data": {
289 | "target": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4",
290 | "source": "LKYWXXAVLLVJAS-UHFFFAOYSA-N",
291 | "id": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4_=>_LKYWXXAVLLVJAS-UHFFFAOYSA-N"
292 | }
293 | },
294 | {
295 | "data": {
296 | "target": "QIVBCDIJIAJPQS-UHFFFAOYSA-N",
297 | "source": "LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4",
298 | "id": "QIVBCDIJIAJPQS-UHFFFAOYSA-N_=>_LKYWXXAVLLVJAS-UHFFFAOYSA-N-RR-02-0c9c5a5559e132c7-16-F-0-4"
299 | }
300 | }
301 | ]
302 | }
303 | }
--------------------------------------------------------------------------------
/expected_results/pickles/tree_end_search.pkl.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/expected_results/pickles/tree_end_search.pkl.tar.gz
--------------------------------------------------------------------------------
/expected_results/results.csv:
--------------------------------------------------------------------------------
1 | parameter,value
2 | stop_at_first_result,False
3 | c_name,deoxiviolacein
4 | c_smiles,
5 | c_inchi,"InChI=1S/C20H13N3O2/c24-19-13(18-12-6-2-4-8-16(12)22-20(18)25)9-17(23-19)14-10-21-15-7-3-1-5-11(14)15/h1-10,21H,(H,22,25)(H,23,24)/b18-13+"
6 | fire_timeout,1
7 | organism_name,ecoli
8 | complementary_sink,
9 | itermax,1000
10 | expansion_width,10
11 | time_budget,7200
12 | max_depth,7
13 | minimal_visit_counts,1
14 | UCT_policy,Biochemical_UCT_1
15 | UCTK,20.0
16 | bias_k,0.0
17 | k_rave,0.0
18 | use_RAVE,False
19 | penalty,-1
20 | full_state_reward,2
21 | Rollout_policy,Rollout_policy_random_uniform_on_biochemical_multiplication_score
22 | max_rollout,3
23 | chemical_scoring,SubandprodChemicalScorer
24 | biological_score_cut_off,0.1
25 | substrate_only_score_cut_off,0.7
26 | chemical_score_cut_off,0.7
27 | virtual_visits,0
28 | progressive_bias_strategy,0
29 | progressive_widening,False
30 | diameter,"[10, 12, 14, 16]"
31 | EC_filter,
32 | tree_to_complete,
33 | found_pathways,4
34 | TIME_EXECUTION,3.58
35 | STOP_REASON,iteration
36 | NUMBER_ITERATION,999
37 |
--------------------------------------------------------------------------------
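The file above pairs each run option (documented in document_all_options.md) with its value, followed by summary statistics (found_pathways, TIME_EXECUTION, STOP_REASON, NUMBER_ITERATION); a minimal sketch for loading such a file back into a dictionary:

# Sketch only: read the parameter/value summary of a tree search run.
import csv

with open("expected_results/results.csv") as handle:
    results = {row["parameter"]: row["value"] for row in csv.DictReader(handle)}
print("{} pathways found in {} s".format(results["found_pathways"], results["TIME_EXECUTION"]))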
/move.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the Move class, which holds:
3 | - compound it applies to
4 | - rsmart
5 | - rid
6 | - set (because a single rule can generate multiple product sets)
7 | - biological_score
8 | - chemical_score
9 | """
10 |
11 | # General utilities
12 | import logging
13 | import csv
14 |
15 | class Move(object):
16 | """
17 |     Basic move object. At the moment it mostly holds attributes, with little logic per se.
18 | """
19 |
20 | logger = logging.getLogger(__name__)
21 | def __init__(self,
22 | rsmart,
23 | rid,
24 | compound_id,
25 | rsmiles = None,
26 | set_number = 0,
27 | chemical_score = 0,
28 | chemical_substrate_score = 0,
29 | biological_score = 0,
30 | product_list = [],
31 | EC_number = ["EC: None"],
32 | compound_index = 0, stoechiometry = {}):
33 | self.rsmart = rsmart
34 | if rsmiles is None:
35 | self.rsmiles = self.rsmart
36 | else:
37 | self.rsmiles = rsmiles
38 | self.rid = rid
39 | self.compound_id = compound_id
40 | self.set_number = set_number
41 | self.chemical_score = chemical_score
42 | self.chemical_substrate_score = chemical_substrate_score
43 | self.biological_score = biological_score
44 | self.EC_numbers = EC_number
45 | self.product_list = product_list
46 | self.name = "{}-{}-{}".format(self.compound_id, self.rid, str(self.set_number))
47 | self.synonyms = [self.rid]
48 | self.RAVE_visits = 0
49 | self.RAVE_total_score = 0
50 | self.RAVE_average_score = 0
51 | self.stoechiometry = stoechiometry
52 |
53 | def set_set_number(self, set_number):
54 | self.set_number = set_number
55 | self.name = "{}-{}-{}".format(self.compound_id, self.rid, str(self.set_number))
56 |
57 | def set_rsmart(self, rsmart):
58 | self.rsmart = rsmart
59 |
60 | def set_rsmiles(self, rsmiles):
61 | self.rsmiles = rsmiles
62 |
63 | def calculate_rsmiles(self, substrate):
64 | """
65 | Smiles of the actual transformation that is happening between the substrate and the products
66 | """
67 | sub_smiles = "{}".format(substrate.csmiles)
68 | prod_smiles = ".".join([prod.csmiles for prod in self.full_product_list()])
69 | self.rsmiles = "{}>>{}".format(sub_smiles, prod_smiles)
70 |
71 | def set_chemical_score(self, chemical_score):
72 | self.chemical_score = chemical_score
73 |
74 | def set_chemical_substrate_score(self, chemical_substrate_score):
75 | self.chemical_substrate_score = chemical_substrate_score
76 |
77 | def delete_intermediate_chemical_score(self):
78 | del self.original_substrates_list
79 | del self.original_products_list_list
80 |
81 | def set_intermediate_chemical_score(self, original_substrates_list, original_products_list_list):
82 | self.original_substrates_list = original_substrates_list
83 | self.original_products_list_list = original_products_list_list
84 |
85 | def set_id(self, id):
86 | self.id = id
87 |
88 | def set_EC_numbers(self, EC_numbers):
89 | self.EC_numbers = EC_numbers
90 |
91 | def set_biological_score(self, biological_score):
92 | self.biological_score = biological_score
93 |
94 | def set_product_list(self, product_list):
95 | self.product_list = product_list
96 |
97 | def set_stoechiometry(self, stoechiometry):
98 | self.stoechiometry = stoechiometry
99 |
100 | def __repr__(self):
101 | return self.name
102 |
103 | def print_all_attributes(self):
104 | text = "For move {}, attributes are: rid: {}, cid: {} \n".format(self.name, self.rid, self.compound_id)
105 | text_next = "set: {}, chem_score: {}, bio score: {} \n".format(self.set_number, self.chemical_score, self.biological_score)
106 | text_last = "product_list: {}, stoechiometry: {} \n".format(self.product_list, self.stoechiometry)
107 | text_appendix = "EC numbers are {}".format(self.EC_numbers)
108 | return (text + text_next + text_last + text_appendix)
109 |
110 | def full_product_list(self):
111 | full_list = []
112 | ordered_product_list = sorted(self.product_list, key = lambda item: self.stoechiometry[item.InChIKey])
113 | for product in ordered_product_list:
114 | for i in range(self.stoechiometry[product.InChIKey]):
115 | full_list.append(product)
116 | return full_list
117 |
118 |     def _calculate_simles_from_move(self):
119 |         pass  # incomplete stub; rsmiles is built by calculate_rsmiles instead
120 |
121 | def clone(self):
122 | cloned_move = Move(
123 | rsmart=self.rsmart,
124 | rid=self.rid,
125 | compound_id=self.compound_id,
126 | set_number=self.set_number,
127 | chemical_score=self.chemical_score,
128 | biological_score=self.biological_score,
129 | product_list=self.product_list,
130 | EC_number=self.EC_numbers,
131 | stoechiometry=self.stoechiometry,
132 | )
133 | try:
134 | cloned_move.set_intermediate_chemical_score(
135 | self.original_substrates_list,
136 | self.original_products_list_list,
137 | )
138 | except AttributeError:
139 | pass
140 | return cloned_move
141 |
142 | def add_synonym(self, move):
143 | """
144 | Adds a synonym to this move.
145 | (When another move was deemed equal to current move (self))
146 | """
147 | if move.rid not in self.synonyms:
148 | self.synonyms.append(move.rid)
149 | for EC in move.EC_numbers:
150 | if EC not in self.EC_numbers:
151 | self.EC_numbers.append(EC)
152 | if self.biological_score * self.chemical_score < move.biological_score * move.chemical_score:
153 | self.biological_score = move.biological_score
154 | self.chemical_score = move.chemical_score
155 | self.stoechiometry = move.stoechiometry
156 |
157 | def eq_full_inchi_key(self, other):
158 | """
159 |         Two moves are identical if they
160 | - apply to the same compound
161 | - generate the same products
162 | """
163 | compound_eq = (self.compound_id == other.compound_id)
164 | products_eq = len(self.product_list) == len(other.product_list)
165 | for product in self.product_list:
166 | products_eq = products_eq and (product.in_list(other.product_list, main_layer = False))
167 | return(compound_eq and products_eq)
168 |
169 | def eq_main_layer(self, other):
170 | """
171 |         Two moves are identical if they
172 |         - apply to the same compound
173 |         - generate the same products (compared on the InChIKey main layer only)
174 | """
175 | compound_eq = (self.compound_id == other.compound_id)
176 | products_eq = len(self.product_list) == len(other.product_list)
177 | for product in self.product_list:
178 | products_eq = products_eq and (product.in_list(other.product_list, main_layer = True))
179 | return(compound_eq and products_eq)
180 |
181 | def in_list(self, list_moves, main_layer = False):
182 | in_list = False
183 | for move_in_list in list_moves:
184 | if main_layer:
185 | equality = self.eq_main_layer(move_in_list)
186 | if equality:
187 | in_list = True
188 | move_in_list.add_synonym(self)
189 | break
190 | else:
191 | equality = self.eq_full_inchi_key(move_in_list)
192 | if equality:
193 | in_list = True
194 | move_in_list.add_synonym(self)
195 | break
196 | return(in_list)
197 |
198 | def update(self, result, visit_number = 1):
199 | """
200 |         Update the RAVE statistics (visits, total and average score); these values are only used by the RAVE implementation.
201 | """
202 | self.RAVE_visits = self.RAVE_visits + visit_number
203 | self.RAVE_total_score = self.RAVE_total_score + result * visit_number
204 | self.RAVE_average_score = self.RAVE_total_score/self.RAVE_visits
205 |
--------------------------------------------------------------------------------
/organisms.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines organisms as ChemicalCompoundState objects.
3 | They are unpickled here after being generated by the calculate_organisms set-up script.
4 | """
5 |
6 | # General utilities
7 | import logging
8 | import pickle
9 | import os
10 | import csv
11 | import sys
12 |
13 | from config import *
14 |
15 | # RP3 specific objects
16 | from compound import Compound, unpickle
17 | from chemical_compounds_state import ChemicalCompoundState
18 | from rdkit.Chem import AllChem
19 | from utilities.reactor.Utils import standardize_chemical, standardize_results, handle_results, ChemConversionError
20 | from utilities.reactor.cli import worker_match, worker_fire, RuleConversionError
21 |
22 |
23 | class NotReady(Exception):
24 |     """Raised when organisms or rules have not been calculated in advance"""
25 |
26 | def __init__(self, msg = "Not Ready. Need to run set-up scripts"):
27 | self._msg = msg
28 |
29 | def __str__(self):
30 | return self._msg
31 |
32 |
33 | def import_organism_from_csv(csv_file, add_Hs=True):
34 | with open(csv_file) as csv_handle:
35 | dict_reader = csv.DictReader(csv_handle, delimiter=",")
36 | compound_list = []
37 | for row in dict_reader:
38 | name = row["name"]
39 | inchi = row["inchi"]
40 | if inchi is None or inchi == "None" or inchi == "":
41 | pass
42 | else:
43 | try:
44 | if name.startswith("InChI"):
45 | compound = Compound(
46 | InChI=inchi, heavy_standardisation=True, force_add_H=add_Hs
47 | )
48 | else:
49 | compound = Compound(
50 | InChI=inchi,
51 | name=name,
52 | heavy_standardisation=True,
53 | force_add_H=add_Hs,
54 | )
55 | if not compound.in_list(compound_list, main_layer = False):
56 | compound_list.append(compound)
57 | except ChemConversionError as e:
58 |                     logging.error("For compound {} with inchi {}: {}".format(name, inchi, e))
59 | organism = ChemicalCompoundState(compound_list, main_layer = False)
60 | # organism.set_main_layer(True)
61 | return(organism)
62 |
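# A minimal usage sketch (hypothetical file name and contents, not shipped with the repo):
# given a CSV with a "name,inchi" header and one row per sink compound, e.g.
#     name,inchi
#     water,InChI=1S/H2O/h1H2
# the call below would return a ChemicalCompoundState holding the standardised compounds:
#     my_sink = import_organism_from_csv("my_sink.csv", add_Hs=True)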
63 |
64 | organisms_data_path = "{}/organisms".format(DATA_PATH)
65 | if not os.path.exists(organisms_data_path):
66 | os.mkdir(organisms_data_path)
67 |
68 | if not os.path.exists(organisms_data_path + '/state_iML1515_chassis_H.pkl'):
69 | logging.error("Please run calculate_organisms script")
70 | raise NotReady
71 |
72 |
73 | Test_organism_H = unpickle(file_name = 'Test_organism_H', type = 'state', folder_address = organisms_data_path)
74 | ecoli_chassis_H = unpickle(file_name = 'iML1515_chassis_H', type = 'state', folder_address = organisms_data_path)
75 | detectable_cmpds_H = unpickle(file_name = 'detectable_cmpds_H', type = 'state', folder_address = organisms_data_path)
76 | core_ecoli_H = unpickle(file_name = 'core_ecoli_H', type = 'state', folder_address = organisms_data_path)
77 | bsubtilis_H = unpickle(file_name = 'bsubtilis_H', type = 'state', folder_address = organisms_data_path)
78 | iJO1366_chassis_H = unpickle(file_name = 'iJO1366_chassis_H', type = 'state', folder_address = organisms_data_path)
79 |
80 |
81 | Test_organism_noH = unpickle(file_name = 'Test_organism_noH', type = 'state', folder_address = organisms_data_path)
82 | ecoli_chassis_noH = unpickle(file_name = 'iML1515_chassis_noH', type = 'state', folder_address = organisms_data_path)
83 | detectable_cmpds_noH = unpickle(file_name = 'detectable_cmpds_noH', type = 'state', folder_address = organisms_data_path)
84 | core_ecoli_noH = unpickle(file_name = 'core_ecoli_noH', type = 'state', folder_address = organisms_data_path)
85 | bsubtilis_noH = unpickle(file_name = 'bsubtilis_noH', type = 'state', folder_address = organisms_data_path)
86 | iJO1366_chassis_noH = unpickle(file_name = 'iJO1366_chassis_noH', type = 'state', folder_address = organisms_data_path)
87 |
--------------------------------------------------------------------------------
/pathway.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the pathway objects for visualisation and export
3 | """
4 |
5 | # General utilities
6 | import logging
7 | import csv
8 | import copy
9 | import json
10 | import pickle
11 | # RP3 specific objects
12 | from compound import Compound
13 | from move import Move
14 | from chemical_compounds_state import ChemicalCompoundState
15 | from organisms import Test_organism_H
16 |
17 |
18 | class Pathway(object):
19 | """
20 | Pathway object.
21 | Has methods for quick visualisation as well as export to json (for visualisation and treatment)
22 | Also has cloning and compound addition
23 | """
24 | logger = logging.getLogger(__name__)
25 |
26 |     def __init__(self, first_iteration = -1, target = None, compounds = None, moves = None,
27 |                  file_to_save = "temporary_pathway_json", main_layer = True,
28 |                  organism = Test_organism_H, edges = None, nodes_compounds = None, nodes_transformations = None):
29 | """
30 | Initialising a pathway object.
31 | A compound has an ID and a dict with chemical structures
32 |         A reaction links 2 compounds and has a SMARTS, scores, etc.
33 |         self.compounds is a dictionary of ID: chemical_struct_of_compound
34 |         Remarks:
35 |         - a pathway can only be defined for a fully solved Node (i.e. in the Tree, not in rollout)
36 |         - it needs to verify at each step which products are formed,
37 |           as those could have been deleted in the tree search (already in state)
38 | """
39 | self.first_iteration = first_iteration
40 | self.target = target
41 | self.organism = organism
42 | self.main_layer = main_layer
43 |         self.compounds = compounds if compounds is not None else []  # None defaults avoid shared mutable lists between instances
44 |         self.moves = moves if moves is not None else []
45 |         self.file_to_save = file_to_save
46 |         self.nodes_compounds = nodes_compounds if nodes_compounds is not None else []
47 |         self.nodes_transformations = nodes_transformations if nodes_transformations is not None else []
48 |         self.edges = edges if edges is not None else []
49 | self.pathway_as_dict = None
50 |
51 | def __eq__(self, other):
52 | """
53 | Two pathways are identical if their compounds and moves are identical
54 | """
55 | node_compounds_equal = len(self.nodes_compounds) == len(other.nodes_compounds)
56 |         node_transfo_equal = len(self.nodes_transformations) == len(other.nodes_transformations)
57 | node_edges_equal = len(self.edges) == len(other.edges)
58 | compounds_equal = len(self.compounds) == len(other.compounds)
59 | if compounds_equal:
60 | for compound in self.compounds:
61 | in_other = compound.in_list(other.compounds, main_layer = True)
62 | if not in_other:
63 | compounds_equal = False
64 | break
65 | moves_equal = len(self.moves) == len(other.moves)
66 | if moves_equal:
67 | for move in self.moves:
68 | in_other = move.in_list(other.moves, main_layer = True)
69 | if not in_other:
70 | moves_equal = False
71 | break
72 |         equality = compounds_equal and moves_equal and node_compounds_equal and node_transfo_equal and node_edges_equal
73 | return (equality)
74 |
75 | def __repr__(self):
76 | """
77 |         Print the list of compounds and the list of edges
78 | """
79 | rep = 'Compound \n'
80 | for compound in self.compounds:
81 | rep = rep + str(compound) + "\n"
82 | rep = rep + 'Edges \n'
83 | for edge in self.edges:
84 | rep = rep + edge["data"]["id"] + "\n"
85 | return(rep)
86 |
87 | def all_attributes_with_nodes(self):
88 | """
89 |         Print the list of compounds, the edges, and the compound and transformation nodes
90 | """
91 | rep = 'Compound \n'
92 | for compound in self.compounds:
93 | rep = rep + str(compound) + "\n"
94 | rep = rep + 'Edges \n'
95 | for edge in self.edges:
96 | rep = rep + edge["data"]["id"] + "\n"
97 | for node_cp in self.nodes_compounds:
98 | rep = rep + node_cp["data"]["id"] + "\n"
99 | for node_tf in self.nodes_transformations:
100 | rep = rep + node_tf["data"]["id"] + "\n"
101 | return(rep)
102 |
103 | def set_file_to_save(self, file_to_save):
104 | self.file_to_save = file_to_save
105 |
106 | def set_main_layer(self, main_layer):
107 | self.main_layer = main_layer
108 |
109 | def set_first_iteration(self, first_iteration):
110 | self.first_iteration = first_iteration
111 |
112 | def clone(self):
113 | """ Cloning """
114 | duplicated_pathway = Pathway(
115 | first_iteration=self.first_iteration,
116 | organism=self.organism,
117 | main_layer=self.main_layer,
118 | target=self.target,
119 | compounds=[cmp.clone() for cmp in self.compounds],
120 | moves=[mv.clone() for mv in self.moves],
121 | edges=copy.deepcopy(self.edges),
122 | nodes_compounds=copy.deepcopy(self.nodes_compounds),
123 | nodes_transformations=copy.deepcopy(self.nodes_transformations),
124 | )
125 | return duplicated_pathway
126 |
127 |     def save(self, file_name = None, folder_address = "pickled_data"):
128 |         if file_name is None:
129 |             file_name = self.file_to_save  # fall back to the default name for this pathway
130 |         with open('{}/pathway_{}.pkl'.format(folder_address, file_name), 'wb') as file_saving:
131 |             pickle.dump(self, file_saving)
132 |
133 | def add_compound(self, compound, in_sink = None, is_source = 0):
134 | """
135 | Adding a compound object to the pathway.
136 | """
137 | if is_source:
138 | self.target = compound
139 | if not compound.in_list(self.compounds, main_layer = self.main_layer):
140 | self.compounds.append(compound)
141 | if in_sink is None:
142 | if self.organism.compound_in_state(compound):
143 | in_sink = 1
144 | else:
145 | in_sink = 0
146 | data_dict = {
147 | 'SMILES': compound.csmiles,
148 | 'inSink':in_sink,
149 | 'isSource': is_source,
150 | 'InChI': compound.InChI,
151 | 'Names': compound.synonyms_names, # If I want synonyms, keep them
152 | 'id': compound.InChIKey,
153 | 'type': 'compound',
154 | 'Rule ID': None,
155 | 'EC number': None,
156 | 'Reaction SMILES': None,
157 | 'Diameter': None,
158 | 'Score': None,
159 | 'Iteration': None
160 | }
161 | self.nodes_compounds.append({"data": data_dict})
162 | else:
163 | self.logger.warning("Compound {} is already in compounds".format(compound))
164 |
165 |     def clean_up(self, move, depth):
166 |         move_id = "{}-{}-{}-{}".format(move.compound_id, move.rid, move.set_number, depth)
167 |         return(move_id)
168 |
169 | def add_reaction(self, move, depth = 1):
170 | """
171 | Adding a reaction to the pathway.
172 | """
173 | if not move.in_list(self.moves):
174 | self.moves.append(move)
175 |             move_compound_id_present, move_compound_ID = False, move.compound_id  # fallback id so the edge source below is always defined
176 | for cp in self.compounds:
177 | for sym in cp.synonyms_names:
178 | if sym == move.compound_id:
179 | move_compound_id_present = True
180 | move_compound_ID = cp.InChIKey
181 | break
182 | if not move_compound_id_present:
183 | self.logger.warning("Trying to add move {} when compound {} is not in the pathway".format(move, move.compound_id))
184 |
185 | for product in move.product_list:
186 | if not product.in_list(self.compounds):
187 | # Adding the products of the pathway
188 | self.add_compound(product, in_sink = None, is_source = 0)
189 |
190 | cleaned_up_moved = self.clean_up(move, depth)
191 | try:
192 | diameter = int(move.rid.split("-")[3])
193 |             except (AttributeError, IndexError, ValueError):
194 |                 diameter = 42  # fallback when the rule id does not encode a diameter
195 | data_dict = {
196 | "SMILES": None,
197 | "inSink": None,
198 | "isSource": None,
199 | "InChI": None,
200 | "Names": None,
201 | "id": cleaned_up_moved,
202 | "type": "reaction",
203 | "Rule ID": move.synonyms,
204 | "EC number": move.EC_numbers,
205 | "Reaction SMILES": move.rsmiles,
206 | "Diameter": diameter,
207 | "Score": move.biological_score,
208 | "ChemicalScore": move.chemical_score,
209 | "Iteration": depth,
210 | "Stoechiometry": move.stoechiometry
211 | }
212 | self.nodes_transformations.append({"data": data_dict})
213 | # Adding all the edges:
214 | # from compound to reaction (move as target, compound as source)
215 | # From reactions to compound (move as source, product as target)
216 | data_dict = {
217 | "target" : cleaned_up_moved,
218 | "source" : move_compound_ID,
219 | "id" : "{}_=>_{}".format(cleaned_up_moved, move.compound_id)
220 | }
221 | self.edges.append({"data": data_dict})
222 | for product in move.product_list:
223 | data_dict = {
224 | "target" : product.name,
225 | "source" : cleaned_up_moved,
226 | "id" : "{}_=>_{}".format(product.name, cleaned_up_moved)
227 | }
228 | self.edges.append({"data": data_dict})
229 | else:
230 | self.logger.debug("Move {} is already in moves".format(move))
231 |
232 | def jsonify_scope_viewer(self):
233 | """
234 |         Use the scope viewer to visualise pathways until the DBTL pipeline advances further.
235 |         The json file is a dict with a single key called "elements".
236 |         The "elements" value is a dict composed of "nodes" and "edges".
237 |         "nodes" is a list of compound and reaction entries, each carrying its data dict.
238 | """
239 | if self.pathway_as_dict is None:
240 | self.nodes_compounds.reverse()
241 | self.pathway_as_dict = {"elements": {"nodes": self.nodes_compounds + self.nodes_transformations,
242 | "edges": self.edges}}
243 | with open(self.file_to_save, "w") as json_handler:
244 | json.dump(self.pathway_as_dict, json_handler, indent = 2)
245 |
246 | def export_as_json_dict(self):
247 | """
248 | To export as a dict without needing to read and write the json.
249 | """
250 | if self.pathway_as_dict is None:
251 | self.nodes_compounds.reverse()
252 | self.pathway_as_dict = {"elements": {"nodes": self.nodes_compounds + self.nodes_transformations,
253 | "edges": self.edges}}
254 | return(self.pathway_as_dict)
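    # Shape sketch of the exported dict (illustrative, field values hypothetical):
    #     {"elements": {"nodes": [{"data": {"id": "...", "type": "compound", ...}},
    #                             {"data": {"id": "...", "type": "reaction", ...}}],
    #                   "edges": [{"data": {"source": "...", "target": "...", "id": "..."}}]}}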
255 |
256 |
257 | def __cli():
258 | """Command line interface. Was actually used to make quick
259 | tests before implementing them in the testing file"""
260 |     logging.basicConfig(
261 |         level=logging.INFO,  # defaults to stderr, so no sys import is needed in this module
262 |         datefmt='%d/%m/%Y %H:%M:%S',
263 | format='%(asctime)s -- %(levelname)s -- %(message)s'
264 | )
265 | logging.warning("CLI is not available for Pathway")
266 |
267 |
268 | if __name__ == "__main__":
269 | __cli()
270 |
--------------------------------------------------------------------------------
/pathway_scoring.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines the pathway scoring functions.
3 | Can take as inputs both Pathway objects and json dictionaries exported from Pathways.
4 | """
5 |
6 | import random
7 | import numpy as np
8 | import json
9 | import os
10 | # RP3 - specific objects
11 | from pathway import Pathway
12 |
13 |
14 | def geo_mean(iterable):
15 | a = np.array(iterable)
16 | return a.prod()**(1.0/len(a))
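# Quick numeric check (illustrative): geo_mean([0.5, 0.8]) == (0.5 * 0.8) ** 0.5, i.e. about 0.632.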
17 |
18 | # def geo_mean_overflow(iterable):
19 | # a = np.log(iterable)
20 | # return np.exp(a.sum()/len(a))
21 |
22 | class PathwayScoring(object):
23 | """
24 | Defines Pathway Scorer object.
25 | """
26 | def __init__(self, scoring_function = None, scoring_json_function = None):
27 | if scoring_function is None:
28 | pass
29 | else:
30 | self.scoring_function = scoring_function
31 | if scoring_json_function is None:
32 | pass
33 | else:
34 | self.scoring_json_function = scoring_json_function
35 |
36 | def __repr__(self):
37 | """
38 | Name the used scorer.
39 |         Raises an error if the class is not properly instantiated
40 | """
41 | return(self.name)
42 |
43 | def calculate(self, pathway):
44 | score = self.scoring_function(pathway)
45 | return(score)
46 |
47 | def calculate_json(self, pathway):
48 | score = self.scoring_json_function(pathway)
49 | return(score)
50 |
51 | def pseudo_random(pathway):
52 | score = random.uniform(0, 10)
53 | return(score)
54 |
55 | class ConstantPathwayScoring(PathwayScoring):
56 | """
57 | Returns a constant reward, whichever the pathway.
58 | """
59 | def __init__(self, reward = 10):
60 | PathwayScoring.__init__(self)
61 | self.reward = reward
62 | self.scoring_function = self.scoring_function()
63 | self.scoring_json_function = self.scoring_json_function()
64 | self.name = "ConstantPathwayScoring of {}".format(reward)
65 |
66 | def set_reward(self,reward):
67 | # For changing the reward of the object
68 | self.reward = reward
69 | self.scoring_function = self.scoring_function()
70 | self.scoring_json_function = self.scoring_json_function()
71 |
72 | def scoring_function(self):
73 | def pathway_scoring(pathway):
74 | return(self.reward)
75 | return(pathway_scoring)
76 |
77 | def scoring_json_function(self):
78 | def pathway_scoring(pathway):
79 | return(self.reward)
80 | return(pathway_scoring)
81 |
82 | class BiologicalPathwayScoring(PathwayScoring):
83 | """
84 | Returns the geometric mean of biological scores in the Pathway.
85 | """
86 | def __init__(self):
87 | PathwayScoring.__init__(self)
88 | self.scoring_function = self.scoring_function()
89 | self.scoring_json_function = self.scoring_json_function()
90 | self.name = "BiologicalPathwayScoring"
91 |
92 | def scoring_function(self):
93 | def pathway_scoring(pathway):
94 | scores = []
95 | for move in pathway.nodes_transformations:
96 | scores.append(move["data"]["Score"])
97 | return(geo_mean(scores))
98 | return(pathway_scoring)
99 |
100 | def scoring_json_function(self):
101 | def pathway_scoring(pathway):
102 | scores = []
103 | for move in pathway["elements"]["nodes"]:
104 | if move["data"]["type"] == "reaction":
105 | scores.append(move["data"]["Score"])
106 | return(geo_mean(scores))
107 | return(pathway_scoring)
108 |
109 | class ChemicalPathwayScoring(PathwayScoring):
110 | """
111 | Returns the geometric mean of chemical scores in the Pathway.
112 | """
113 | def __init__(self):
114 | PathwayScoring.__init__(self)
115 | self.scoring_function = self.scoring_function()
116 | self.scoring_json_function = self.scoring_json_function()
117 | self.name = "ChemicalPathwayScoring"
118 |
119 | def scoring_function(self):
120 | def pathway_scoring(pathway):
121 | scores = []
122 | for move in pathway.nodes_transformations:
123 | scores.append(move["data"]["ChemicalScore"])
124 | return(geo_mean(scores))
125 | return(pathway_scoring)
126 |
127 | def scoring_json_function(self):
128 | def pathway_scoring(pathway):
129 | scores = []
130 | for move in pathway["elements"]["nodes"]:
131 | if move["data"]["type"] == "reaction":
132 | scores.append(move["data"]["ChemicalScore"])
133 | return(geo_mean(scores))
134 | return(pathway_scoring)
135 |
136 | class BiochemicalPathwayScoring(PathwayScoring):
137 | """
138 | Returns the geometric mean of biochemical scores in the Pathway.
139 | """
140 | def __init__(self):
141 | PathwayScoring.__init__(self)
142 | self.scoring_function = self.scoring_function()
143 | self.scoring_json_function = self.scoring_json_function()
144 |         self.name = "BiochemicalPathwayScoring"
145 |
146 | def scoring_function(self):
147 | def pathway_scoring(pathway):
148 | scores = []
149 | for move in pathway.nodes_transformations:
150 | scores.append(move["data"]["ChemicalScore"] * move["data"]["Score"])
151 | return(geo_mean(scores))
152 | return(pathway_scoring)
153 |
154 | def scoring_json_function(self):
155 | def pathway_scoring(pathway):
156 | scores = []
157 | for move in pathway["elements"]["nodes"]:
158 | if move["data"]["type"] == "reaction":
159 | scores.append(move["data"]["Score"] * move["data"]["ChemicalScore"])
160 | return(geo_mean(scores))
161 | return(pathway_scoring)
162 |
163 | RandomPathwayScorer = PathwayScoring(scoring_function = pseudo_random)
164 | constant_pathway_scoring = ConstantPathwayScoring(reward = 10)
165 | null_pathway_scoring = ConstantPathwayScoring(reward = 0)
166 | biological_pathway_scoring = BiologicalPathwayScoring()
167 | chemical_pathway_scoring = ChemicalPathwayScoring()
168 | biochemical_pathway_scoring = BiochemicalPathwayScoring()
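# Usage sketch (the pathway_dict below is hypothetical, assumed to come from Pathway.export_as_json_dict()):
#     score = biochemical_pathway_scoring.calculate_json(pathway_dict)
# This returns the geometric mean of Score * ChemicalScore over all "reaction" nodes.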
169 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.commitizen]
2 | name = "cz_conventional_commits"
3 | version = "1.1.0"
4 | version_provider = "commitizen"
5 | tag_format = "$version"
6 | version_type = "semver2"
--------------------------------------------------------------------------------
/representation.py:
--------------------------------------------------------------------------------
1 | """
2 | The aim of this file is to define a representation class for tree printing.
3 | It is useful to switch between the two, e.g. coloured output for the terminal versus plain text for file output.
4 | """
5 |
6 | class Representation(object):
7 | """ Contains all things necessary for representing my nodes and trees"""
8 | def __init__(self, delimiter = "|", color = "red", printing_solved = "- solved"):
9 | self.delimiter = delimiter # Delimiter between nodes
10 | if color == "red":
11 | self.color_begin = '\033[91m'
12 | self.color_end = '\033[0m'
13 | elif color == "":
14 | self.color_begin = ''
15 | self.color_end = ''
16 | else:
17 | raise NotImplementedError
18 | self.printing_solved = printing_solved
19 |
20 | Test_representation = Representation(delimiter = "|", color = "red", printing_solved = "")
21 | Test_to_file = Representation(delimiter = "|", color = "", printing_solved = "- solved")
22 |
--------------------------------------------------------------------------------
/rewarding.py:
--------------------------------------------------------------------------------
1 | """
2 | Defines the possible rewards for rollout.
3 | Can be augmented with more complex policies, using a scheme similar to the Rollout or UCT policies.
4 | Is defined through CLI in the Tree script.
5 | """
6 |
7 | class RolloutRewards(object):
8 | """
9 |     Defines the rollout penalty and the full-state reward (granted when the state is fully in the chassis).
10 | """
11 | def __init__(self, penalty, full_state_reward):
12 | self.penalty = penalty
13 | self.full_state_reward = full_state_reward
14 |
15 | def __repr__(self):
16 | """Reward representation is its values"""
17 | return("Penalty is {} and full state reward is {}".format(self.penalty, self.full_state_reward))
18 |
19 | Basic_Rollout_Reward = RolloutRewards(penalty = -1, full_state_reward = 2)
20 |
--------------------------------------------------------------------------------
/rule_sets_examples.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the rules examples that will be used throughout the tests.
3 | The aim is to
4 | """
5 |
6 | import logging
7 | import csv
8 | import os
9 |
10 | rule_10_subset_address = "{}/tests/data/rules_r10_subset.tsv".format(os.path.dirname(__file__))
11 | applicable_rules_10_dict = {}
12 | with open(rule_10_subset_address, "r") as csv_file:
13 | fieldnames = ["Rule_ID", "Reaction_ID", "Diameter", "Direction", "Rule_order", "Rule_SMARTS", "Substrate_ID", "Substrate_SMILES", "Product_IDs", "Product_SMILES", "Rule_SMILES", "Rule_SMARTS_lite"]
14 | csv_reader = csv.DictReader(csv_file, delimiter = '\t', fieldnames = fieldnames)
15 | next(csv_reader) # skip first line
16 | for element in csv_reader:
17 | applicable_rules_10_dict[element["Rule_ID"]] = {"Rule_SMARTS": element["Rule_SMARTS"],
18 | "biological_score": 1,
19 | "EC_number": ["EC: None"],
20 | "Rule_SMILES": element["Rule_SMILES"]}
21 |
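# Resulting structure (illustrative, the rule id is hypothetical): each entry maps a rule id to
#     {"Rule_SMARTS": "...", "biological_score": 1, "EC_number": ["EC: None"], "Rule_SMILES": "..."}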
22 |
23 | rule_2_subset_address = "{}/tests/data/rules_r2_subset.tsv".format(os.path.dirname(__file__))
24 | applicable_rules_2_dict = {}
25 | with open(rule_2_subset_address, "r") as csv_file:
26 | fieldnames = ["Rule_ID", "Reaction_ID", "Diameter", "Direction", "Rule_order", "Rule_SMARTS", "Substrate_ID", "Substrate_SMILES", "Product_IDs", "Product_SMILES", "Rule_SMILES", "Rule_SMARTS_lite"]
27 | csv_reader = csv.DictReader(csv_file, delimiter = '\t', fieldnames = fieldnames)
28 | next(csv_reader) # skip first line
29 | for element in csv_reader:
30 | applicable_rules_2_dict[element["Rule_ID"]] = {"Rule_SMARTS": element["Rule_SMARTS"],
31 | "biological_score": 1,
32 | "EC_number": ["EC: None"],
33 | "Rule_SMILES": element["Rule_SMILES"]}
34 |
35 |
36 | rule_mixed_subset_address = "{}/tests/data/rules_mixed_subset.tsv".format(os.path.dirname(__file__))
37 | applicable_rules_mixed_dict = {}
38 | with open(rule_mixed_subset_address, "r") as csv_file:
39 | fieldnames = ["Rule_ID", "Reaction_ID", "Diameter", "Direction", "Rule_order", "Rule_SMARTS", "Substrate_ID", "Substrate_SMILES", "Product_IDs", "Product_SMILES", "Rule_SMILES", "Rule_SMARTS_lite"]
40 | csv_reader = csv.DictReader(csv_file, delimiter = '\t', fieldnames = fieldnames)
41 | next(csv_reader) # skip first line
42 | for element in csv_reader:
43 | applicable_rules_mixed_dict[element["Rule_ID"]] = {"Rule_SMARTS": element["Rule_SMARTS"],
44 | "biological_score": 1,
45 | "EC_number": ["EC: None"],
46 | "Rule_SMILES": element["Rule_SMILES"]}
47 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | setup(
7 | name="rp3",
8 | version="0.0",
9 | author="Mathilde Koch",
10 | author_email="mathilde.koch@inra.fr",
11 | description="Perform retrosynthesis with Monte-Carlo Tree Search algorithm",
12 | long_description=long_description,
13 | long_description_content_type="text/markdown",
14 | url="https://github.com/brsynth/RetroPath3",
15 | packages=find_packages(),
16 | python_requires=">=3.6",
17 | include_package_data=True,
18 | )
19 |
--------------------------------------------------------------------------------
/supplement_finder.py:
--------------------------------------------------------------------------------
1 | """
2 | Find supplements to complete a Tree.
3 | Read argparser for details of arguments.
4 | The principle is to identify the compounds needed to complete chemical states.
5 | """
6 |
7 | # General utilities
8 | import os
9 | import sys
10 | import time
11 | import signal
12 | import datetime
13 | import logging
14 | import argparse
15 | import pickle
16 | import json
17 |
18 | import random
19 |
20 | from Tree import Tree
21 |
22 | def unpickle(file_name, type = "tree", folder_address = "pickled_data"):
23 | with open('{}/{}_{}.pkl'.format(folder_address, type, file_name), 'rb') as input:
24 | return(pickle.load(input))
25 |
26 |
27 | def run(tree, number_suggestions, rescued_states, folder_to_save, database = None):
28 | potential_supplements = {}
29 | # Extracting all potential supplements from the Tree.
30 | nodes_to_treat = [tree.root_node]
31 | while nodes_to_treat != []:
32 | node = nodes_to_treat[0]
33 | del nodes_to_treat[0]
34 | state = node.state
35 | supplement = state.GetSupplement_from_InChI_Keys()
36 |         if supplement is not None:
37 |             if supplement.InChIKey in potential_supplements:
38 |                 potential_supplements[supplement.InChIKey]["rescued_states"] += 1
39 | else:
40 | information_to_keep = {"structure": supplement.csmiles,
41 | "name_from_MCTS": supplement.name,
42 | "synonyms_names": supplement.synonyms_names,
43 | "rescued_states":1}
44 | potential_supplements[supplement.InChIKey] = information_to_keep
45 | if node.terminal:
46 | pass
47 | else:
48 | for child in node.children:
49 | nodes_to_treat.append(child)
50 | logging.info("Potential supplements without filtering: {}".format(len(potential_supplements.keys())))
51 | # Sorting according to number of rescued states
52 | sorted_supplements = [suppl for suppl, value in sorted(potential_supplements.items(), key=lambda item: item[1]["rescued_states"], reverse=True) if value["rescued_states"] >= rescued_states]
53 | logging.info("Potential supplements after filtering with {} rescued states: {}".format(rescued_states, len(sorted_supplements)))
54 |
55 | # Filtering according to presence in a database of interest
56 | if database is None:
57 | supplements_of_interest = sorted_supplements
58 | logging.warning("Not checking availability within a Database of interest")
59 | else:
60 | supplements_of_interest = []
61 | for element in sorted_supplements:
62 |             if element in database:
63 |                 logging.info("Element {} (rescuing {} states) is in the database ({})".format(element, potential_supplements[element]["rescued_states"], database[element]))
64 |                 supplements_of_interest.append(element)
65 |     # Filtering according to the maximal number of allowed suggestions
66 | if len(supplements_of_interest) > number_suggestions:
67 | supplements_of_interest = supplements_of_interest[0:number_suggestions]
68 | logging.info("Keeping {} potential supplements".format(number_suggestions))
69 | assert len(supplements_of_interest) == number_suggestions
70 | else:
71 | logging.info("Keeping all supplements as there are only {} ({} allowed)".format(len(supplements_of_interest), number_suggestions))
72 |
73 | # Extracting pathways
74 | for supplement_to_extract in supplements_of_interest:
75 | # setting up search
76 | found_pathways = 0
77 | folder_to_save_pathways = "{}/{}".format(folder_to_save, supplement_to_extract.split("-")[0])
78 | if not os.path.exists(folder_to_save_pathways):
79 | os.mkdir(folder_to_save_pathways)
80 | # searching
81 | tree.set_folder_to_save(folder_to_save_pathways)
82 | nodes_to_treat = [tree.root_node]
83 | while nodes_to_treat != []:
84 | node = nodes_to_treat[0]
85 | del nodes_to_treat[0]
86 | state = node.state
87 | supplement = state.GetSupplement_from_InChI_Keys()
88 |             if supplement is not None:
89 |                 if supplement.InChIKey == supplement_to_extract:
90 |                     found_pathways += 1
91 | found_pathway = tree.extract_pathway_from_bottom(node, iteration=found_pathways)
92 | if node.terminal:
93 | pass
94 | else:
95 | for child in node.children:
96 | nodes_to_treat.append(child)
97 |         logging.info("Extracted {} pathways for {}".format(found_pathways, supplement_to_extract))
98 |
99 | def __cli():
100 | """
101 | Command line interface.
102 | """
103 |
104 |     d = "Arguments for supplement finder. Find compounds that can complete a Tree and be supplemented to the media."
105 | parser = argparse.ArgumentParser(description=d)
106 | parser.add_argument("--tree_to_complete", help="Tree to find supplements to", default="end_search")
107 |     parser.add_argument("--folder_tree_to_complete", help="Folder containing the pickled tree (its 'pickles' subfolder) to find supplements for", default=None)
108 |
109 |     parser.add_argument("--number_suggestions", default = 20, type = int,
110 |                         help = "Maximum number of suggestions returned")
111 |     parser.add_argument("--rescued_states", default = 1, type = int,
112 |                         help = "Minimum number of times the compound must complete states")
113 | parser.add_argument("--folder_to_save", default="testing_supplement_finder")
114 | parser.add_argument("--terminal", help="Default logger is within the new folder_to_save, switch to terminal if specified",
115 | action='store_true', default=False)
116 | parser.add_argument("--database_address", default=None,
117 | help = "Address of a database to check availability. Json format required. Keys are inchikeys. Values are names, but could be cost or any metric of interest")
118 |
119 | args = parser.parse_args()
120 | folder_to_save = args.folder_to_save
121 | if not os.path.exists(folder_to_save):
122 | os.makedirs(folder_to_save, exist_ok=True)
123 |
124 | if args.terminal is True:
125 | logging.basicConfig(
126 | stream = sys.stderr,
127 | level=logging.INFO,
128 | datefmt='%d/%m/%Y %H:%M:%S',
129 | format='%(asctime)s -- %(levelname)s -- %(message)s'
130 | )
131 | else:
132 | logging.basicConfig(
133 | stream = open("{}/{}.log".format(folder_to_save, "supplement_finder"), "w"),
134 | level=logging.INFO,
135 | datefmt='%d/%m/%Y %H:%M:%S',
136 | format='%(asctime)s -- %(levelname)s -- %(message)s'
137 | )
138 | completed_tree = unpickle(file_name=args.tree_to_complete,
139 | type='tree',
140 | folder_address="{}/pickles".format(args.folder_tree_to_complete))
141 | if args.database_address is None:
142 | database = None
143 | else:
144 | with open(args.database_address, "r") as json_file:
145 | database = json.load(json_file)
146 |
147 | run(completed_tree, number_suggestions = args.number_suggestions,
148 | rescued_states =args.rescued_states, folder_to_save = args.folder_to_save,
149 | database = database)
150 |
151 |
152 | if __name__ == "__main__":
153 | __cli()
154 |
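# Illustrative invocation (paths are hypothetical; see the argparser above for defaults):
#     python supplement_finder.py \
#         --folder_tree_to_complete some_results_folder \
#         --tree_to_complete end_search \
#         --number_suggestions 10 --rescued_states 2 \
#         --folder_to_save some_results_folder/supplements
# The pickled tree is expected at <folder_tree_to_complete>/pickles/tree_end_search.pkl.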
--------------------------------------------------------------------------------
/tests/data/state_BOPG_BSAB_GPRL.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/tests/data/state_BOPG_BSAB_GPRL.pkl
--------------------------------------------------------------------------------
/tests/data/tree_pipecolate_test.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/tests/data/tree_pipecolate_test.pkl
--------------------------------------------------------------------------------
/tests/generated_jsons/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitkeep
3 | !.gitignore
--------------------------------------------------------------------------------
/tests/generated_jsons/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/tests/generated_jsons/.gitkeep
--------------------------------------------------------------------------------
/tests/test_Filters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pytest
3 |
4 | from utilities.chemtools.Filters import Filters
5 | from rdkit.Chem import MolFromSmiles, MolToSmiles
6 | from rdkit.Chem import MolFromInchi, MolToInchi
7 |
8 |
9 | def test_init():
10 | assert Filters()
11 |
12 | def test_copy_properties():
13 | # TODO: add some tests here
14 | pass
15 |
16 | def test_keep_biggest():
17 | mol = Filters.keep_biggest(MolFromSmiles('CCCC.CC'))
18 | assert MolToSmiles(mol) == 'CCCC'
19 | mol = Filters.keep_biggest(MolFromSmiles('CCCCC.CC.[H].CCC'))
20 | assert MolToSmiles(mol) == 'CCCCC'
21 | mol = Filters.keep_biggest(MolFromInchi('InChI=1S/C5H12N2O2.C4H7NO4/c6-3-1-2-4(7)5(8)9;5-2(4(8)9)1-3(6)7/h4H,1-3,6-7H2,(H,8,9);2H,1,5H2,(H,6,7)(H,8,9)/t4-;2-/m00/s1'))
22 | assert MolToInchi(mol) == 'InChI=1S/C4H7NO4/c5-2(4(8)9)1-3(6)7/h2H,1,5H2,(H,6,7)(H,8,9)/t2-/m0/s1'
23 | mol = Filters.keep_biggest(MolFromInchi('InChI=1S/Mo.4O/q;;;2*-1'))
24 | assert MolToInchi(mol) == 'InChI=1S/Mo'
25 |
26 | def test_commute_inchi():
27 | inchi = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1'
28 | mol = Filters.commute_inchi(MolFromInchi(inchi))
29 | assert MolToInchi(mol) == inchi
30 |
31 | def test_remove_isotope():
32 | mol = Filters.remove_isotope(MolFromSmiles('c1cc[14cH]cc1'))
33 | assert MolToSmiles(mol) == ('c1ccccc1')
34 |
35 | def test_neutralise_charge():
36 | mol = Filters.neutralise_charge(MolFromSmiles('CC(C(=O)[O-])O'))
37 | assert MolToSmiles(mol) == ('CC(O)C(=O)O')
38 |
39 | def test_add_hydrogen():
40 | mol = Filters.add_hydrogen(MolFromSmiles('CC(O)C(=O)O'))
41 | assert MolToSmiles(mol) == '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'
42 | mol = Filters.add_hydrogen(MolFromSmiles('CC(C(=O)[O-])O'))
43 | assert MolToSmiles(mol) == '[H]OC([H])(C(=O)[O-])C([H])([H])[H]'
44 |
45 | def test_kekulize():
46 | mol = Filters.kekulize(MolFromSmiles('c1ccccc1'))
47 | assert MolToSmiles(mol) == 'C1=CC=CC=C1'
48 |
49 | def test_remove_stereo():
50 | mol = Filters.remove_stereo(MolFromSmiles('C[C@@H](C(=O)[O-])O'))
51 | assert MolToSmiles(mol) == 'CC(O)C(=O)[O-]'
52 | mol = Filters.remove_stereo(MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'))
53 | assert MolToSmiles(mol) == 'OC1=NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(O)=Nc2ccccc21'
54 | mol = Filters.commute_inchi(mol) # Expected to change tautomerism
55 | assert MolToSmiles(mol) == 'O=C1NC(C2=CNC3=C2C=C(O)C=C3)=CC1=C1C(=O)NC2=CC=CC=C21'
56 |
--------------------------------------------------------------------------------
/tests/test_Standardizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import pytest
3 |
4 | import inspect
5 | from utilities.chemtools.Standardizer import Standardizer
6 | from utilities.chemtools.Sequences import sequence_tunable
7 | from rdkit.Chem import MolFromSmiles, MolToSmiles
8 | from rdkit.Chem import MolFromInchi, MolToInchi
9 |
10 | def test_init():
11 | def sequence_dummy(mol):
12 | return mol
13 | assert Standardizer()
14 | assert Standardizer(sequence_fun=sequence_dummy)
15 | assert Standardizer(sequence_fun=sequence_dummy, params=dict())
16 |
17 | def test_sequence_minimal():
18 | # Violacein
19 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
20 | ans = Standardizer().compute(mol)
21 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
22 | assert MolToSmiles(ans) == 'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'
23 | # L-Lactate
24 | mol = MolFromInchi('')
25 |
26 | def test_sequence_rr_legacy():
27 | # Violacein
28 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
29 | ans = Standardizer(sequence_fun='sequence_rr_legacy').compute(mol)
30 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
31 | assert MolToSmiles(ans) == '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'
32 |
33 | def test_sequence_tunable():
34 | # Check default arguments
35 | args, varargs, varkw, defaults, kwonlyargs, kwonlydefaults, annotations = inspect.getfullargspec(sequence_tunable)
36 | default_params = dict(zip(args[-len(defaults):], defaults))
37 | assert default_params == {
38 | 'OP_REMOVE_ISOTOPE':True,
39 | 'OP_NEUTRALISE_CHARGE': True,
40 | 'OP_REMOVE_STEREO': False,
41 | 'OP_COMMUTE_INCHI': False,
42 | 'OP_KEEP_BIGGEST': True,
43 | 'OP_ADD_HYDROGEN': True,
44 | 'OP_KEKULIZE': True,
45 | 'OP_NEUTRALISE_CHARGE_LATE': True
46 | }
47 | # Violacein, default parameter
48 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
49 | ans = Standardizer(sequence_fun='sequence_tunable').compute(mol)
50 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
51 | assert MolToSmiles(ans) == '[H]OC1=NC(C2=C([H])N([H])C3=C2C([H])=C(O[H])C([H])=C3[H])=C([H])/C1=C1\\C(O[H])=NC2=C([H])C([H])=C([H])C([H])=C21'
52 | # Violacein, strip stereo
53 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
54 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_REMOVE_STEREO': True}).compute(mol)
55 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)'
56 | assert MolToSmiles(ans) == '[H]OC1=C([H])C2=C(C([H])=C1[H])N([H])C([H])=C2C1=C([H])C(=C2C(=O)N([H])C3=C([H])C([H])=C([H])C([H])=C23)C(=O)N1[H]'
57 |     # Violacein, implicit Hs
58 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
59 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_ADD_HYDROGEN': False}).compute(mol)
60 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
61 | assert MolToSmiles(ans) == 'OC1=CC2=C(C=C1)NC=C2C1=C/C(=C2/C3=CC=CC=C3N=C2O)C(O)=N1'
62 |     # Violacein, no kekulization
63 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
64 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_KEKULIZE': False}).compute(mol)
65 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+'
66 | assert MolToSmiles(ans) == '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'
67 |     # Violacein, strip stereo & implicit Hs & no kekulization
68 | mol = MolFromInchi('InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)/b18-13+')
69 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_REMOVE_STEREO': True, 'OP_ADD_HYDROGEN': False, 'OP_KEKULIZE': False}).compute(mol)
70 | assert MolToInchi(ans) == 'InChI=1S/C20H13N3O3/c24-10-5-6-15-12(7-10)14(9-21-15)17-8-13(19(25)23-17)18-11-3-1-2-4-16(11)22-20(18)26/h1-9,21,24H,(H,22,26)(H,23,25)'
71 | assert MolToSmiles(ans) == 'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'
72 | # Lactate, default parameter
73 | mol = MolFromSmiles('C[C@@H](C(=O)[O-])O')
74 | ans = Standardizer(sequence_fun='sequence_tunable').compute(mol)
75 | assert MolToInchi(ans) == 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/t2-/m0/s1'
76 | assert MolToSmiles(ans) == '[H]OC(=O)[C@@]([H])(O[H])C([H])([H])[H]'
77 | # L-lactate, implicit Hs
78 | mol = MolFromSmiles('C[C@@H](C(=O)[O-])O')
79 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_ADD_HYDROGEN': False}).compute(mol)
80 | assert MolToInchi(ans) == 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/t2-/m0/s1'
81 | assert MolToSmiles(ans) == 'C[C@H](O)C(=O)O'
82 | # L-lactate, no stereo
83 | mol = MolFromSmiles('C[C@@H](C(=O)[O-])O')
84 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_REMOVE_STEREO': True}).compute(mol)
85 | assert MolToInchi(ans) == 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'
86 | assert MolToSmiles(ans) == '[H]OC(=O)C([H])(O[H])C([H])([H])[H]'
87 | # L-lactate, no charge neutralisation
88 | mol = MolFromSmiles('C[C@@H](C(=O)[O-])O')
89 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_NEUTRALISE_CHARGE': False, 'OP_NEUTRALISE_CHARGE_LATE': False}).compute(mol)
90 | assert MolToInchi(ans) == 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)/p-1/t2-/m0/s1'
91 | assert MolToSmiles(ans) == '[H]O[C@]([H])(C(=O)[O-])C([H])([H])[H]'
92 | # L-lactate, implicit Hs & no stereo
93 | mol = MolFromSmiles('C[C@@H](C(=O)[O-])O')
94 | ans = Standardizer(sequence_fun='sequence_tunable', params={'OP_ADD_HYDROGEN': False, 'OP_REMOVE_STEREO': True}).compute(mol)
95 | assert MolToInchi(ans) == 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'
96 | assert MolToSmiles(ans) == 'CC(O)C(=O)O'
97 |
--------------------------------------------------------------------------------
/tests/test_Tree.py:
--------------------------------------------------------------------------------
1 | """
2 | Aim: test compound features
3 | """
4 |
5 | # General utility packages
6 | import random
7 | import pickle
8 | import logging
9 | # RP3 specific objects
10 | from compound import Compound, unpickle
11 | from chemical_compounds_state import ChemicalCompoundState
12 | from representation import Test_representation, Test_to_file
13 | from organisms import detectable_cmpds_H, Test_organism_H
14 | from organisms import detectable_cmpds_noH
15 | from rewarding import Basic_Rollout_Reward
16 | from MCTS_node import MCTS_node
17 | from UCT_policies import Biochemical_UCT_1, Nature_UCT, Classical_UCT_RAVE, Classical_UCT_with_bias, Classical_UCT
18 | from rule_sets_examples import applicable_rules_mixed_dict, applicable_rules_10_dict
19 | from Tree import Tree
20 | from rule_sets_similarity import get_rules_and_score, full_rules_forward_H, full_rules_retro_H, full_rules_forward_no_H, full_rules_retro_no_H
21 |
22 |
23 |
24 | random.seed(42)
25 |
26 |
27 | class TestTree(object):
28 | def test_equality_statement_not_expanded(self):
29 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
30 | compound = Compound(csmile, name = "821")
31 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
32 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
33 |
34 | test_Tree = Tree(root_state = state, itermax = 100)
35 | test_Tree_bis = Tree(root_state = state_bis, itermax = 100)
36 | assert test_Tree == test_Tree_bis
37 |
38 | def test_equality_statement_expanded(self):
39 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
40 | compound = Compound(csmile, name = "821")
41 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
42 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
43 |
44 | test_Tree = Tree(root_state = state, itermax = 100)
45 | test_Tree_bis = Tree(root_state = state_bis, itermax = 100)
46 | test_Tree.run_search()
47 | test_Tree_bis.run_search()
48 | assert test_Tree == test_Tree_bis
49 |
50 |     def test_equality_statement_expanded_different_iter(self):
51 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
52 | compound = Compound(csmile, name = "821")
53 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
54 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
55 |
56 | test_Tree = Tree(root_state = state, itermax = 100)
57 | test_Tree_bis = Tree(root_state = state_bis, itermax = 1000)
58 | test_Tree.run_search()
59 | test_Tree_bis.run_search()
60 | assert test_Tree != test_Tree_bis
61 |
62 | def test_equality_statement_expanded_false(self):
63 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
64 | compound = Compound(csmile, name = "821")
65 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
66 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
67 |
68 | test_Tree = Tree(root_state = state, itermax = 100)
69 | test_Tree_bis = Tree(root_state = state_bis, itermax = 100)
70 | test_Tree.run_search()
71 | assert test_Tree != test_Tree_bis
72 |
73 | def test_equality_statement_expanded_states(self):
74 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
75 | compound = Compound(csmile, name = "821")
76 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
77 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
78 |
79 | test_Tree = Tree(root_state = state, itermax = 100, available_rules = applicable_rules_mixed_dict)
80 | test_Tree_bis = Tree(root_state = state_bis, itermax = 500, available_rules = applicable_rules_mixed_dict)
81 | test_Tree.run_search()
82 | test_Tree_bis.run_search()
83 | different_trees = test_Tree != test_Tree_bis
84 | same_states = test_Tree.equality_visited_states(test_Tree_bis)
85 | assert different_trees and same_states
86 |
87 | def test_equality_statement_expanded_states_other_policies(self):
88 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
89 | compound = Compound(csmile, name = "821")
90 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
91 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
92 |
93 | test_Tree = Tree(root_state = state, itermax = 100)
94 | test_Tree_bis = Tree(root_state = state_bis, itermax = 1000, UCT_policy = "Nature_UCT")
95 | test_Tree.run_search()
96 | test_Tree_bis.run_search()
97 | different_trees = test_Tree != test_Tree_bis
98 | same_states = test_Tree.equality_visited_states(test_Tree_bis)
99 | assert different_trees and same_states
100 |
101 | def test_pickling_unpickling(self, tmpdir):
102 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
103 | compound = Compound(csmile, name = "821")
104 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
105 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
106 |
107 | test_Tree = Tree(root_state = state, itermax = 10000, parallel = False,
108 | Rollout_policy = "Rollout_policy_first",
109 | UCT_policy = "Biochemical_UCT_1")
110 | test_Tree.run_search()
111 | test_Tree.save("test", folder_address = tmpdir)
112 | loaded_tree = unpickle(file_name = 'test', type = 'tree', folder_address = tmpdir)
113 | assert test_Tree == loaded_tree
114 |
115 | def test_pickling_unpickling_differ(self, tmpdir):
116 | csmile = "[H][C](=[O])[C]([H])([H])[C]([H])([H])[H]"
117 | compound = Compound(csmile, name = "821")
118 | state = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
119 | state_bis = ChemicalCompoundState([compound], organism = Test_organism_H, representation = Test_representation) # state is not sanitised
120 |
121 | test_Tree = Tree(root_state = state, itermax = 10000, parallel = False,
122 | Rollout_policy = "Rollout_policy_first",
123 | UCT_policy = "Biochemical_UCT_1")
124 | test_Tree.run_search()
125 | test_Tree.save("test", folder_address = tmpdir)
126 | test_Tree.run_search()
127 | loaded_tree = unpickle(file_name = 'test', type = 'tree', folder_address = tmpdir)
128 | assert test_Tree != loaded_tree
129 |
130 | def test_biosensor(self):
131 | organism = detectable_cmpds_H
132 | inchi = "InChI=1S/C6H11NO2/c8-6(9)5-3-1-2-4-7-5/h5,7H,1-4H2,(H,8,9)"
133 | compound = Compound(InChI = inchi, name = "pipecolate")
134 | present_in_state_detectable = organism.compound_in_state(compound)
135 | if present_in_state_detectable:
136 | logging.warning("Removed compound from the detectable set to force enzymatic detection")
137 | organism.remove_cmpd_from_state(compound)
138 | rules, biological_scoring = get_rules_and_score(full_rules_forward_H = full_rules_forward_H,
139 | full_rules_retro_H = full_rules_retro_H,
140 | full_rules_forward_no_H = full_rules_forward_no_H,
141 | full_rules_retro_no_H = full_rules_retro_no_H,
142 | add_Hs = True,
143 | retro = False,
144 | diameters = [10, 12, 14, 16],
145 | small = False,
146 | c_name = None,
147 | filtering_EC = ["1.5.3.7", "1.5.3"])
148 | state = ChemicalCompoundState([compound]) # state is not sanitised
149 | test_Tree = Tree(root_state = state, itermax = 1000, parallel = False,
150 | Rollout_policy = "Rollout_policy_first",
151 | UCT_policy = "Biochemical_UCT_1", available_rules = rules, organism = organism,
152 | biological_scorer = biological_scoring,
153 | folder_to_save = "tests/generated_jsons")
154 | test_Tree.run_search()
155 | loaded_tree = unpickle(file_name = 'pipecolate_test', type = 'tree', folder_address = "tests/data")
156 | same_states = test_Tree.equality_visited_states(loaded_tree)
157 | assert same_states
158 |
--------------------------------------------------------------------------------
/tests/test_Utils.py:
--------------------------------------------------------------------------------
1 | import rdkit
2 | from rdkit import Chem
3 | from rdkit.Chem import AllChem
4 | import pytest
5 |
6 |
7 | from utilities.reactor.Utils import standardize_chemical, standardize_results, handle_results
8 |
9 |
10 | class TestBasic2(object):
11 |
12 | def test_standardize_chemical_1(self):
13 | rdmol = Chem.MolFromSmiles('[H][O][C](=[O])[C]([H])([O][H])[C]([H])([H])[H]')
14 | rdmol_std_1 = standardize_chemical(rdmol, add_hs=False)
15 | assert Chem.MolToSmiles(rdmol_std_1) == 'CC(O)C(=O)O'
16 | rdmol_std_2 = standardize_chemical(rdmol, add_hs=True)
17 | assert Chem.MolToSmiles(rdmol_std_2, allHsExplicit=True) == '[H][O][C](=[O])[C]([H])([O][H])[C]([H])([H])[H]'
18 |
19 | def test_standardize_chemical_2(self):
20 | # Data
21 | violacein_smiles = 'OC1=NC(=C\\C1=C1/C(O)=NC2=CC=CC=C12)C1=CNC2=C1C=C(O)C=C2'
22 | violacein_mol = Chem.MolFromSmiles(violacein_smiles, sanitize=False)
23 | # Test simplest case
24 | std_mol_1 = standardize_chemical(violacein_mol, add_hs=False, rm_stereo=False)
25 | assert Chem.MolToSmiles(std_mol_1) == 'OC1=NC(c2c[nH]c3ccc(O)cc23)=C/C1=C1\\C(O)=Nc2ccccc21'
26 | # Test adding Hs
27 | std_mol_2 = standardize_chemical(violacein_mol, add_hs=True, rm_stereo=False)
28 | assert Chem.MolToSmiles(std_mol_2) == '[H]OC1=NC(c2c([H])n([H])c3c([H])c([H])c(O[H])c([H])c23)=C([H])/C1=C1\\C(O[H])=Nc2c([H])c([H])c([H])c([H])c21'
29 | # Test removing stereo
30 | std_mol_3 = standardize_chemical(violacein_mol, add_hs=False, rm_stereo=True)
31 | assert Chem.MolToSmiles(std_mol_3) == 'O=C1NC(c2c[nH]c3ccc(O)cc23)=CC1=C1C(=O)Nc2ccccc21'
32 | # Test adding Hs + removing stereo
33 | std_mol_4 = standardize_chemical(violacein_mol, add_hs=True, rm_stereo=True)
34 | assert Chem.MolToSmiles(std_mol_4) == '[H]Oc1c([H])c([H])c2c(c1[H])c(C1=C([H])C(=C3C(=O)N([H])c4c([H])c([H])c([H])c([H])c43)C(=O)N1[H])c([H])n2[H]'
35 |
36 | def test_standardize_chemical_3(self):
37 | # Data
38 | wrong_smiles = '[H]OC(=O)C([H])([H])C([H])([H])C([H])(N=C(O[H])C([H])([H])C([H])([H])C([H])(N=C(O[H])C([H])(OP(=O)(O[H])OC([H])([H])C([H])(O[H])C([H])(O[H])C([H])(O[H])C([H])([H])n1c2nc(=O)nc(O[H])c-2c([H])c2c([H])c([H])c(OP(=O)(OC([H])([H])C(C([H])([H])[H])(C([H])([H])[H])C([H])(O[H])C(=NC([H])([H])C([H])([H])C(=NC([H])([H])C([H])([H])SC(=O)C([H])([H])C([H])([H])C([H])([H])C([H])(C(=C([H])[H])C([H])([H])[H])C([H])([H])C(=O)O[H])O[H])O[H])OP(=O)(O[H])OC([H])([H])C3([H])OC([H])(n4[c]([H])n([H])[c]5[c](N([H])[H])[n][c]([H])[n][c]54)C([H])(O[H])C3([H])OP(=O)(O[H])O[H])c([H])c21)C([H])([H])[H])C(=O)O[H])C(=O)O[H]'
39 | # Test
40 | wrong_mol = Chem.MolFromSmiles(wrong_smiles, sanitize=False)
41 | with pytest.raises(Exception):
42 | standardize_chemical(wrong_mol)
43 |
44 | def test_standardize_results_1(self):
45 | tuple_tuple_raw = ((
46 | Chem.MolFromSmiles('[H][O][C](=[O])[C]([H])([O][P](=[O])([O][H])[O][H])[C]([H])([H])[H]'),
47 | Chem.MolFromSmiles('[H][N]=[c]1[n][c]([O][H])[c]2[n][c]([H])[n]([C]3([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][H])[C]([H])([O][H])[C]3([H])[O][H])[c]2[n]1[H]')
48 | ),(
49 | Chem.MolFromInchi('InChI=1S/C5H6N5O/c6-5-9-3-2(4(11)10-5)7-1-8-3/h1H,9H2,(H,7,8)(H2,6,10,11)')
50 | ))
51 | tuple_tuple_rdmol, tuple_index_failed = standardize_results(tuple_tuple_raw, add_hs=True, rm_stereo=True)
52 | assert len(tuple_tuple_rdmol) == 1
53 | assert tuple_index_failed == [1]
54 |
55 | def test_handle_result(self):
56 | tuple_raw = (
57 | Chem.MolFromSmiles('[H][O][C](=[O])[C]([H])([O][P](=[O])([O][H])[O][H])[C]([H])([H])[H]'),
58 | Chem.MolFromSmiles('[H][N]=[c]1[n][c]([O][H])[c]2[n][c]([H])[n]([C]3([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][H])[C]([H])([O][H])[C]3([H])[O][H])[c]2[n]1[H]')
59 | )
60 | tuple_tuple_rdmol, tuple_tuple_failed = standardize_results(tuple_tuple_rdmol=(tuple_raw,), add_hs=True, rm_stereo=True)
61 | inchikeys, inchis, smiles = handle_results(list_list_rdmol=tuple_tuple_rdmol)
62 |         # Check number of products
63 |         assert len(inchikeys) == len(inchis) == len(smiles) == 1  # only one set of results
64 | assert len(inchikeys[0]) == len(inchis[0]) == len(smiles[0]) == 2 # 2 products
65 | # Check Inchikeys
66 | assert inchikeys[0][0] == 'CSZRNWHGZPKNKY-UHFFFAOYSA-N'
67 | assert inchikeys[0][1] == 'QGWNDRXFNXRZMB-UHFFFAOYSA-N'
68 | # Check Inchis
69 | assert inchis[0][0] == 'InChI=1S/C3H7O6P/c1-2(3(4)5)9-10(6,7)8/h2H,1H3,(H,4,5)(H2,6,7,8)'
70 | assert inchis[0][1] == 'InChI=1S/C10H15N5O11P2/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(25-9)1-24-28(22,23)26-27(19,20)21/h2-3,5-6,9,16-17H,1H2,(H,22,23)(H2,19,20,21)(H3,11,13,14,18)'
71 | # Check SMILES #1
72 | assert smiles[0][0] == '[H]OC(=O)C([H])(OP(=O)(O[H])O[H])C([H])([H])[H]'
73 | rdmol = Chem.MolFromSmiles(smiles[0][0])
74 | rdmol = Chem.AddHs(rdmol)
75 | assert Chem.MolToSmiles(rdmol, allHsExplicit=True) == '[H][O][C](=[O])[C]([H])([O][P](=[O])([O][H])[O][H])[C]([H])([H])[H]'
76 | # Check SMILES #2
77 | assert smiles[0][1] == '[H]N=c1nc(O[H])c2nc([H])n(C3([H])OC([H])(C([H])([H])OP(=O)(O[H])OP(=O)(O[H])O[H])C([H])(O[H])C3([H])O[H])c2n1[H]'
78 | rdmol = Chem.MolFromSmiles(smiles[0][1])
79 | rdmol = Chem.AddHs(rdmol)
80 | assert Chem.MolToSmiles(rdmol, allHsExplicit=True) == '[H][N]=[c]1[n][c]([O][H])[c]2[n][c]([H])[n]([C]3([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][H])[C]([H])([O][H])[C]3([H])[O][H])[c]2[n]1[H]'
81 |
--------------------------------------------------------------------------------
/tests/test_cli.py:
--------------------------------------------------------------------------------
1 | """
2 | Test RuleBurner class
3 | """
4 |
5 | import rdkit
6 | from rdkit import Chem
7 | import pytest
8 | import multiprocessing
9 |
10 |
11 | from utilities.reactor.cli import RuleBurner, RuleConversionError, ChemConversionError
12 |
13 |
14 | # Data for tests
15 | substate_inchi = 'InChI=1S/C3H6O3/c1-2(4)3(5)6/h2,4H,1H3,(H,5,6)'
16 | reaction_smarts = '([#8&v2:1](-[#6&v4:2](-[#6&v4:3](-[#8&v2:4]-[#1&v1:5])=[#8&v2:6])(-[#6&v4:7](-[#1&v1:8])(-[#1&v1:9])-[#1&v1:10])-[#1&v1:11])-[#1&v1:12])>>([#15&v5](=[#8&v2])(-[#8&v2]-[#1&v1])(-[#8&v2]-[#1&v1])-[#8&v2:1]-[#6&v4:2](-[#6&v4:3](-[#8&v2:4]-[#1&v1:5])=[#8&v2:6])(-[#6&v4:7](-[#1&v1:8])(-[#1&v1:9])-[#1&v1:10])-[#1&v1:11].[#7&v3](=[#6&v4]1:[#7&v3]:[#6&v4](-[#8&v2]-[#1&v1]):[#6&v4]2:[#7&v3]:[#6&v4](-[#1&v1]):[#7&v3](-[#6&v4]3(-[#1&v1])-[#8&v2]-[#6&v4](-[#6&v4](-[#8&v2]-[#15&v5](=[#8&v2])(-[#8&v2]-[#1&v1])-[#8&v2]-[#15&v5](-[#8&v2]-[#1&v1:12])(=[#8&v2])-[#8&v2]-[#1&v1])(-[#1&v1])-[#1&v1])(-[#1&v1])-[#6&v4](-[#8&v2]-[#1&v1])(-[#1&v1])-[#6&v4]-3(-[#8&v2]-[#1&v1])-[#1&v1]):[#6&v4]:2:[#7&v3]:1-[#1&v1])-[#1&v1])'
17 | tuple_product_inchikeys = ('CSZRNWHGZPKNKY-UHFFFAOYSA-N', 'QGWNDRXFNXRZMB-UHFFFAOYSA-N')
18 | tuple_product_smiles = ('[H][O][C](=[O])[C]([H])([O][P](=[O])([O][H])[O][H])[C]([H])([H])[H]', '[H][N]=[c]1[n][c]([O][H])[c]2[n][c]([H])[n]([C]3([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][H])[C]([H])([O][H])[C]3([H])[O][H])[c]2[n]1[H]')
19 | tuple_product_inchis = ('InChI=1S/C3H7O6P/c1-2(3(4)5)9-10(6,7)8/h2H,1H3,(H,4,5)(H2,6,7,8)', 'InChI=1S/C10H15N5O11P2/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(25-9)1-24-28(22,23)26-27(19,20)21/h2-3,5-6,9,16-17H,1H2,(H,22,23)(H2,19,20,21)(H3,11,13,14,18)')
20 |
21 |
22 | def dummy_worker(**kwargs):
23 | import time
24 | time.sleep(1)
25 |
26 |
27 | def test_init():
28 |     # An empty rule / chemical list is OK
29 |     rb = RuleBurner(rsmarts_list=[], inchi_list=[])
30 | rb.compute()
31 |
32 |
33 | def test_run_with_timeout():
34 | rb = RuleBurner(rsmarts_list=[], inchi_list=[])
35 | with pytest.raises(multiprocessing.context.TimeoutError):
36 | rb._run_with_timeout(dummy_worker, None, timeout=0)
37 | rb._run_with_timeout(dummy_worker, None, timeout=2)
38 |
39 |
40 | def test_jsonify():
41 | rb = RuleBurner(rsmarts_list=[], inchi_list=[])
42 | assert rb._jsonify(rsmarts='', inchi='', rid='RID', cid='CID').replace('\n', '') == """{ "rule_id": "RID", "substrate_id": "CID", "fire_timed_out": null, "fire_exec_time": null}"""
43 |
44 |
45 | def test_compute():
46 | # Wrong reaction depiction
47 | rb = RuleBurner(rsmarts_list=['DUMMY'], inchi_list=[])
48 | with pytest.raises(RuleConversionError):
49 | rb.compute()
50 | # Wrong chemical depiction
51 | rb = RuleBurner(rsmarts_list=[reaction_smarts], inchi_list=['DUMMY'])
52 | with pytest.raises(ChemConversionError):
53 | rb.compute()
54 | # Timeout should be logged
55 | rb = RuleBurner(rsmarts_list=[reaction_smarts], inchi_list=[substate_inchi], fire_timeout=0)
56 | rb.compute()
57 |     assert '"fire_timed_out": true' in ''.join(rb._json)
58 | # OK
59 | rb = RuleBurner(rsmarts_list=[reaction_smarts], inchi_list=[substate_inchi])
60 | rb.compute()
61 |     assert 'InChI=1S/C3H7O6P/c1-2(3(4)5)9-10(6,7)8/h2H,1H3,(H,4,5)(H2,6,7,8)' in ''.join(rb._json)
62 |     assert 'InChI=1S/C10H15N5O11P2/c11-10-13-7-4(8(18)14-10)12-2-15(7)9-6(17)5(16)3(25-9)1-24-28(22,23)26-27(19,20)21/h2-3,5-6,9,16-17H,1H2,(H,22,23)(H2,19,20,21)(H3,11,13,14,18)' in ''.join(rb._json)
63 |
--------------------------------------------------------------------------------
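A minimal sketch of how RuleBurner is driven programmatically, based only on what the tests above exercise (the constructor keyword lists, compute(), the RuleConversionError raised on a malformed rule, and the internal _json attribute); the SMARTS string below is deliberately invalid so the snippet stays short, and everything not shown in the tests should be treated as an assumption rather than documented API.

# Minimal sketch of RuleBurner usage, mirroring test_compute() above.
from utilities.reactor.cli import RuleBurner, RuleConversionError

rb = RuleBurner(rsmarts_list=['not-a-valid-smarts'], inchi_list=[])
try:
    rb.compute()                  # parsing the malformed rule raises
except RuleConversionError as err:
    print(err)
# With a real rule SMARTS and substrate InChI (see the module-level test data),
# rb._json is expected to collect one JSON record per (rule, substrate) pair.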
/tests/test_moves.py:
--------------------------------------------------------------------------------
1 | """
2 | Aim: test move features
3 | """
4 |
5 | # RP3 objects
6 | from compound import Compound
7 | from move import Move
8 |
9 | class TestMove(object):
10 | """
11 | Testing moves - should be fast
12 | """
13 | def test_cloning(self):
14 | move = Move(rsmart = "rsmart",
15 | rid = "rid",
16 | compound_id= "compound_id")
17 | cloned_move = move.clone()
18 | different_python_object = (id(move) != id(cloned_move))
19 | identical_move_object = move.eq_full_inchi_key(cloned_move)
20 | assert (different_python_object and identical_move_object)
21 |
22 | def test_equality_true(self):
23 | compound_1 = Compound("[H+]")
24 | compound_6 = Compound("[H][N]=[C]([O][H])[C]1=[C]([H])[N]([C]2([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][C]([H])([H])[C]3([H])[O][C]([H])([n]4[c]([H])[n][c]5[c]([N]([H])[H])[n][c]([H])[n][c]54)[C]([H])([O][P](=[O])([O][H])[O][H])[C]3([H])[O][H])[C]([H])([O][H])[C]2([H])[O][H])[C]([H])=[C]([H])[C]1([H])[H]")
25 | compound_2345 = Compound("[H][C](=[O])[C]([H])=[C]([H])[H]")
26 | move = Move(rsmart = "rsmart",
27 | rid = "rid",
28 | compound_id= "compound_id",
29 | product_list = [compound_1, compound_6],
30 | set_number = 5)
31 | move_bis = Move(rsmart = "rsmart",
32 | rid = "rid",
33 | compound_id= "compound_id",
34 | product_list = [compound_6, compound_1])
35 |
36 | assert move.eq_full_inchi_key(move_bis)
37 |
38 | def test_equality_false(self):
39 | compound_1 = Compound("[H+]")
40 | compound_6 = Compound("[H][N]=[C]([O][H])[C]1=[C]([H])[N]([C]2([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][C]([H])([H])[C]3([H])[O][C]([H])([n]4[c]([H])[n][c]5[c]([N]([H])[H])[n][c]([H])[n][c]54)[C]([H])([O][P](=[O])([O][H])[O][H])[C]3([H])[O][H])[C]([H])([O][H])[C]2([H])[O][H])[C]([H])=[C]([H])[C]1([H])[H]")
41 | compound_2345 = Compound("[H][C](=[O])[C]([H])=[C]([H])[H]")
42 | move = Move(rsmart = "rsmart",
43 | rid = "rid",
44 | compound_id= "compound_id",
45 | product_list = [compound_1, compound_6])
46 | move_bis = Move(rsmart = "rsmart",
47 | rid = "rid",
48 | compound_id= "compound_id",
49 | product_list = [compound_6, compound_1, compound_2345])
50 | move_ter = Move(rsmart = "rsmart",
51 | rid = "rid",
52 | compound_id= "compound_id_2",
53 | product_list = [compound_6, compound_1])
54 |
55 | assert move != move_bis and move != move_ter and move_bis != move_ter
56 |
57 | def test_rave_update(self):
58 | compound_1 = Compound("[H+]")
59 | compound_6 = Compound("[H][N]=[C]([O][H])[C]1=[C]([H])[N]([C]2([H])[O][C]([H])([C]([H])([H])[O][P](=[O])([O][H])[O][P](=[O])([O][H])[O][C]([H])([H])[C]3([H])[O][C]([H])([n]4[c]([H])[n][c]5[c]([N]([H])[H])[n][c]([H])[n][c]54)[C]([H])([O][P](=[O])([O][H])[O][H])[C]3([H])[O][H])[C]([H])([O][H])[C]2([H])[O][H])[C]([H])=[C]([H])[C]1([H])[H]")
60 | move = Move(rsmart = "rsmart",
61 | rid = "rid",
62 | compound_id= "compound_id",
63 | product_list = [compound_1, compound_6])
64 |
65 | move.update(5, visit_number = 10)
66 | move.update(0.2, 10)
67 | assert move.RAVE_total_score == 52
68 | assert move.RAVE_visits == 20
69 |
70 |     # TODO: more complex tests with compounds
71 |
--------------------------------------------------------------------------------
/tests/tree_test.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brsynth/RetroPathRL/7de91f0236cf3c3dfc2c0455bd7dbcee9f715d2f/tests/tree_test.pkl
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
--------------------------------------------------------------------------------
/tree_viewer.py:
--------------------------------------------------------------------------------
1 | """
2 | Contains the tree objects for visualisation and export
3 | """
4 |
5 | # General utility packages
6 | import logging
7 | import csv
8 | import copy
9 | import json
10 | import sys
11 |
12 | # RP3 specific objects
13 | from compound import Compound
14 | from move import Move
15 | from chemical_compounds_state import ChemicalCompoundState
16 | from MCTS_node import MCTS_node
17 | # General configuration
18 | from config import *
19 |
20 | class Tree_viewer(object):
21 | """
22 | Tree_viewer object.
23 | Has methods for quick visualisation as well as export to json
24 | """
25 | logger = logging.getLogger(__name__)
26 |
27 | def __init__(self,
28 | file_to_save = "temporary_tree_viewer_json"):
29 | """
30 | Initialising a tree viewer object.
31 | A Node has:
32 | - level
33 | - scores (total and average)
34 | - visits
35 | - terminal
36 | - root
37 | - a chemical state
38 |         - an id, built from the chemical state plus a number to make it unique
39 | - whether it has a solved child
40 | A Move:
41 | - Biological score
42 | - Chemical score
43 | - EC numbers
44 |         - compound ID it applies to
45 | - smarts
46 | - name
47 | An edge links both
48 | """
49 | # Where to save the json
50 | self.file_to_save = file_to_save
51 | # For tree viewer json
52 | self.nodes_nodes = []
53 | self.nodes_transformations = []
54 | self.edges = []
55 |
56 | def set_file_to_save(self, file_to_save):
57 | self.file_to_save = file_to_save
58 |
59 | def add_node(self, node):
60 | """
61 | Adding a node object to the tree.
62 | """
63 | if node.terminal:
64 | terminal = 1
65 | else:
66 | terminal = 0
67 | if node.move is None:
68 | root = 1
69 | else:
70 | root = 0
71 | node_dict = {
72 | 'type': 'node',
73 | 'id': "node_{}".format(node.id),
74 | 'level': node.level,
75 | 'root': root,
76 | 'terminal': terminal,
77 | 'Names': str(node.state), # If I want synonyms, keep them
78 | 'average_score': node.average_score,
79 | 'total_score': node.total_score,
80 | 'visits': node.visits,
81 | 'solved_child': node.has_a_solved_child
82 | }
83 | self.nodes_nodes.append({"data": node_dict})
84 |
85 |         if node.move is not None:
86 | move_to_child = {
87 | "target" : "move_{}".format(node.move.id),
88 | "source" : "node_{}".format(node.id),
89 | "id" : "{}_=>_{}".format("move_{}".format(node.move.id), "node_{}".format(node.id))
90 | }
91 | self.edges.append({"data": move_to_child})
92 | if use_transpositions:
93 | parent_nodes = transposition_table[node.parent.hash]
94 | for parent in parent_nodes:
95 | parent_to_move = {
96 | "target" : "node_{}".format(parent.id),
97 | "source" : "move_{}".format(node.move.id),
98 | "id" : "{}_=>_{}".format("node_{}".format(parent.id), "move_{}".format(node.move.id))
99 | }
100 | self.edges.append({"data": parent_to_move})
101 | else:
102 | parent_to_move = {
103 | "target" : "node_{}".format(node.parent.id),
104 | "source" : "move_{}".format(node.move.id),
105 | "id" : "{}_=>_{}".format("node_{}".format(node.parent.id), "move_{}".format(node.move.id))
106 | }
107 | self.edges.append({"data": parent_to_move})
108 | biological_score = node.move.biological_score
109 | try:
110 | diameter = int(node.move.rid.split("-")[3])
111 |             except Exception:
112 | diameter = 42
113 | move_dict = {
114 | 'type': 'move',
115 | 'id': "move_{}".format(node.move.id),
116 | "Rule ID": node.move.synonyms,
117 | "EC number": node.move.EC_numbers,
118 | "Reaction SMILES": node.move.rsmiles,
119 | "Diameter": diameter,
120 | "Score": biological_score,
121 | "ChemicalScore": node.move.chemical_score,
122 | "Name": node.move.name
123 | }
124 | self.nodes_transformations.append({"data": move_dict})
125 |
126 | def jsonify_tree_viewer(self):
127 | """
128 |         Export the tree as a JSON file for the scope viewer, to visualise pathways before the DBTL cycle advances further.
129 |         The JSON file is a dict with a single key, "elements".
130 |         The "elements" value is a dict with two keys, "nodes" and "edges":
131 |         "nodes" lists compounds and reactions, "edges" links them.
132 | """
133 | pathway_as_dict = {"elements": {"nodes": self.nodes_nodes + self.nodes_transformations,
134 | "edges": self.edges}}
135 | with open(self.file_to_save, "w") as json_handler:
136 | json.dump(pathway_as_dict, json_handler, indent = 2)
137 |
138 |
139 | def __cli():
140 | """Command line interface. Was actually used to make quick
141 | tests before implementing them in the testing file"""
142 | print("CLI is not available for this module - tree viewing is automatically generated by Tree module")
143 |
144 |
145 | if __name__ == "__main__":
146 | __cli()
147 |
--------------------------------------------------------------------------------
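For reference, a sketch of the JSON layout written by Tree_viewer.jsonify_tree_viewer(), reconstructed from add_node() above; every concrete value below is an illustrative placeholder, not real output.

# Shape of the file written by jsonify_tree_viewer(); values are placeholders.
example_tree_viewer_json = {
    "elements": {
        "nodes": [
            # one entry per MCTS node ...
            {"data": {"type": "node", "id": "node_3", "level": 1, "root": 0, "terminal": 0,
                      "Names": "<str(node.state)>", "average_score": 0.7, "total_score": 7.0,
                      "visits": 10, "solved_child": False}},
            # ... and one entry per applied move (transformation)
            {"data": {"type": "move", "id": "move_3", "Rule ID": ["<synonyms>"],
                      "EC number": ["1.1.1.1"], "Reaction SMILES": "<rsmiles>",
                      "Diameter": 10, "Score": 0.9, "ChemicalScore": 0.5, "Name": "<rule name>"}},
        ],
        "edges": [
            # node -> move and move -> parent node links
            {"data": {"target": "move_3", "source": "node_3", "id": "move_3_=>_node_3"}},
        ],
    },
}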
/utilities/chemtools/Filters.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Set of filters to be used for chemical standardisation
4 |
5 | @author: Baudoin Delepine, 2016-2017
6 | @author: Thomas Duigou, 2018-2019
7 | """
8 |
9 | from copy import deepcopy
10 | from rdkit.Chem import AddHs, GetMolFrags, Kekulize, MolToInchi, MolFromInchi, MolFromSmarts, MolFromSmiles, RemoveStereochemistry, MolToSmiles, RemoveHs
11 | from rdkit.Chem.AllChem import Compute2DCoords, ReplaceSubstructs
12 | from rdkit.Chem.Descriptors import MolWt
13 |
14 |
15 | class Filters(object):
16 | """Set of filters to be used for chemical standardization.
17 | """
18 |
19 | @classmethod
20 | def _copy_properties(cls, mol_from, mol_to):
21 | """Copy properties from a RDKit compound to another one.
22 |
23 | :param mol_from: RDKit Mol source object
24 | :param mol_to: RDKit Mol target object
25 |
26 | Warning: aside from chemical's name, all private properties are lost.
27 | """
28 | # NB: name is stored in its default location which is "_Name" and
29 |         # is a private property.
30 | property_list = mol_from.GetPropNames(includePrivate=False)
31 | if mol_from.HasProp('_Name'): # TD: If _Name is set always save name
32 | property_list.append("_Name")
33 | for property_name in property_list:
34 | mol_to.SetProp(property_name, mol_from.GetProp(property_name))
35 |
36 | @classmethod
37 | def keep_biggest(cls, mol_in):
38 | """Strip small fragments from compound.
39 |
40 | Returns a new compound where only the "biggest" fragment is conserved
41 | according to (i) the number of non-Hs atoms and if there is tie then
42 | according to (ii) the molecular weight.
43 |
44 | :param mol_in: RDKit Mol
45 | :return mol_out: new RDKit Mol having only one connected component
46 | """
47 | def count_non_hs_atom(mol):
48 | ans = 0
49 | for atm in mol.GetAtoms():
50 | if atm.GetAtomicNum() != 1:
51 | ans += 1
52 | return ans
53 | # Remove "other" molecules
54 | molfrag = GetMolFrags(mol_in, asMols=True, sanitizeFrags=False)
55 | mol_out = mol_in
56 | if len(molfrag) > 1:
57 | accepted_nbr_atm = 0 # flag number of atoms in fragment
58 |             accepted_mass = 0  # molecular weight of the biggest fragment so far
59 | for f in molfrag:
60 | nbr_atm = count_non_hs_atom(f)
61 | if nbr_atm > accepted_nbr_atm or (nbr_atm == accepted_nbr_atm and MolWt(f) > accepted_mass):
62 | accepted_nbr_atm = nbr_atm
63 | accepted_mass = MolWt(f)
64 | mol_out = f # keep only the biggest fragment
65 | cls._copy_properties(mol_in, mol_out) # save the name and stuff
66 | return mol_out
67 |
68 | @classmethod
69 | def commute_inchi(cls, mol_in):
70 | """Convert RDKit compound back and forth to InChi.
71 |
72 | Returns a new compound after the initial one has been converted
73 | back and forth to InChi.
74 |
75 | :param mol_in: RDKit Mol
76 | :return mol_out: RDKit Mol
77 | """
78 | inchi = MolToInchi(mol_in, logLevel=None) # this is talkative...
79 | mol_out = MolFromInchi(inchi, sanitize=False, removeHs=False,
80 | logLevel=None, treatWarningAsError=False)
81 | if not mol_out:
82 | raise ValueError("Failed InChi validity filter.")
83 | # Copy the properties
84 | cls._copy_properties(mol_in, mol_out)
85 | return mol_out
86 |
87 | @classmethod
88 | def remove_isotope(cls, mol_in):
89 | """Strip all isotope information.
90 |
91 | Returns a new compound.
92 |
93 | :param mol_in: RDKit Mol
94 | :return mol_out: RDKit Mol
95 | """
96 | mol_out = deepcopy(mol_in) # copy it, just for consistency with other filters
97 | for atm in mol_out.GetAtoms():
98 | atm.SetIsotope(0)
99 | if not mol_out:
100 | raise ValueError("Failed isotope removing filter.")
101 | return mol_out
102 |
103 | @staticmethod
104 | def _rules_rdkit():
105 | patts = (
106 | ('[n+;H]', 'n'), # Imidazoles
107 | ('[N+;!H0]', 'N'), # Amines
108 | ('[$([O-]);!$([O-][#7])]', 'O'), # Carboxylic acids and alcohols
109 | ('[S-;X1]', 'S'), # Thiols
110 | ('[$([N-;X2]S(=O)=O)]', 'N'), # Sulfonamides
111 | ('[$([N-;X2][C,N]=C)]', 'N'), # Enamines
112 | ('[n-]', '[nH]'), # Tetrazoles
113 | ('[$([S-]=O)]', 'S'), # Sulfoxides
114 | ('[$([N-]C=O)]', 'N'), # Amides
115 | )
116 | return [(MolFromSmarts(x), MolFromSmiles(y, False)) for x, y in patts]
117 |
118 | @staticmethod
119 | def _rules_molvs():
120 | """Rules to neutralize compounds. Inspired by molvs."""
121 | ans = {}
122 | # Neutralizable positive charge (with hydrogens attached)
123 | # ans["pos_h"] = Chem.MolFromSmarts('[+!H0!$(*~[-])]')
124 | ans["pos_h"] = MolFromSmarts('[+!H0]')
125 | # Non-neutralizable positive charge (no hydrogens attached)
126 | # ans["pos_quat"] = Chem.MolFromSmarts('[+H0!$(*~[-])]')
127 | # Negative charge, not bonded to a positive charge with no hydrogens
128 | # ans["neg"] = Chem.MolFromSmarts('[-!$(*~[+H0])]')
129 | ans["neg"] = MolFromSmarts('[-]')
130 | # Negative oxygen bonded to [C,P,S]=O, negative aromatic nitrogen?
131 | # ans["neg_acid"] = Chem.MolFromSmarts('[$([O-][C,P,S]=O),$([n-]1nnnc1),$(n1[n-]nnc1)]')
132 | return ans
133 |
134 | @classmethod
135 | def _neutralise_charge_method1(cls, mol_in, rules=None):
136 | """Neutralise charges according to a set of predefined rules.
137 |
138 | From:
139 | http://www.rdkit.org/docs/Cookbook.html#neutralizing-charged-molecules
140 | """
141 |         # Fall back to the default rules if none are provided
142 |         if rules is None:
143 |             fun_rules = cls._rules_rdkit
144 |         else:
145 |             fun_rules = rules
146 |         if not hasattr(fun_rules, "rules"):  # cache the compiled rules once
147 |             fun_rules.rules = fun_rules()
148 |
149 | # Apply rules
150 | # Better to use ReplaceSubstructs than RunReactant: the latter would give
151 | # several products (or we would need to use HasSubstructMatch anyway).
152 | for reactant, product in fun_rules.rules:
153 | while mol_in.HasSubstructMatch(reactant):
154 | rms = ReplaceSubstructs(mol_in, reactant, product)
155 | mol_in = rms[0]
156 | mol_in.UpdatePropertyCache()
157 | return mol_in
158 |
159 | @classmethod
160 | def _neutralise_charge_method2(cls, mol_in):
161 | """Neutralise charges as much as possible playing on hydrogens.
162 |
163 | You should sanitize the compounds after this operation.
164 |
165 | From:
166 | http://molvs.readthedocs.io/en/latest/_modules/molvs/charge.html
167 | """
168 | mol_out = deepcopy(mol_in) # copy it, just for consistency with other operations
169 | mol_out.UpdatePropertyCache(strict=False) # recompute implicit valence
170 | # Check if rules are already initialised as an attribute
171 | if not hasattr(cls._rules_molvs, "rules"):
172 | cls._rules_molvs.rules = cls._rules_molvs()
173 | # Get atom ids for matches
174 | p = [x[0] for x in mol_out.GetSubstructMatches(cls._rules_molvs.rules['pos_h'])]
175 | # q = [x[0] for x in cc.GetSubstructMatches(cls._rules_molvs.rules['pos_quat'])]
176 | n = [x[0] for x in mol_out.GetSubstructMatches(cls._rules_molvs.rules['neg'])]
177 | # a = [x[0] for x in cc.GetSubstructMatches(cls._rules_molvs.rules['neg_acid'])]
178 | # Neutralize negative charges
179 | # if q:
180 | # # Surplus negative charges more than non-neutralizable positive charges
181 | # neg_surplus = len(n) - len(q)
182 | # if a and neg_surplus > 0:
183 | # # zwitterion with more negative charges than quaternary positive centres
184 | # while neg_surplus > 0 and a:
185 | # # Add hydrogen to first negative acid atom, increase formal charge
186 | # # Until quaternary positive == negative total or no more negative acid
187 | # atom = cc.GetAtomWithIdx(a.pop(0))
188 | # atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)
189 | # atom.SetFormalCharge(atom.GetFormalCharge() + 1)
190 | # neg_surplus -= 1
191 | # Finish of neutralization of negative charges (we don't care for zwitterion)
192 | for atom in [mol_out.GetAtomWithIdx(x) for x in n]:
193 | while atom.GetFormalCharge() < 0:
194 | atom.SetNumExplicitHs(atom.GetNumExplicitHs() + 1)
195 | atom.SetFormalCharge(atom.GetFormalCharge() + 1)
196 | # Neutralize positive charges
197 | for atom in [mol_out.GetAtomWithIdx(x) for x in p]:
198 | # Remove hydrogen and reduce formal charge until neutral or no more hydrogens
199 | while atom.GetFormalCharge() > 0 and atom.GetTotalNumHs() > 0:
200 | atom.SetFormalCharge(atom.GetFormalCharge() - 1)
201 | if atom.GetNumExplicitHs() > 0:
202 | atom.SetNumExplicitHs(atom.GetNumExplicitHs() - 1)
203 | return mol_out
204 |
205 | @classmethod
206 | def neutralise_charge(cls, mol_in):
207 | """Neutralise charges.
208 |
209 | :param mol_in: RDKit Mol
210 | :return mol_out: RDKit Mol
211 | """
212 | return cls._neutralise_charge_method1(mol_in)
213 | # return cls._neutralise_charge_method2(mol_in)
214 |
215 | @classmethod
216 | def add_hydrogen(cls, mol_in, addCoords=True):
217 | """Explicit all hydrogens.
218 |
219 | :param mol_in: RDKit Mol
220 | :param addCoords: Add coordinate to added Hs, bool
221 | :return mol_out: RDKit Mol
222 | """
223 | return AddHs(mol_in, explicitOnly=False, addCoords=addCoords)
224 |
225 | @classmethod
226 | def remove_hydrogen(cls, mol_in, addCoords=True):
227 | """Implicit all hydrogens.
228 |
229 | :param mol_in: RDKit Mol
230 | :param addCoords: Add coordinate to added Hs, bool
231 | :return mol_out: RDKit Mol
232 | """
233 | return RemoveHs(mol_in, explicitOnly=False, addCoords=addCoords)
234 |
235 | @classmethod
236 | def kekulize(cls, mol_in):
237 | """Kekulize compound.
238 |
239 | :param mol_in: RDKit Mol
240 | :return mol_out: RDKit Mol
241 | """
242 | mol_out = deepcopy(mol_in)
243 | Kekulize(mol_out, clearAromaticFlags=True)
244 | return mol_out
245 |
246 | @classmethod
247 | def remove_stereo(cls, mol_in):
248 | """Wild stereo removal.
249 |
250 |         Warning: a back and forth InChI export/import is needed to normalise tautomers
251 |
252 | :param mol_in: RDKit mol
253 | :return mol_out: RDKit mol
254 | """
255 | mol_out = deepcopy(mol_in)
256 | RemoveStereochemistry(mol_out)
257 | return mol_out
258 |
--------------------------------------------------------------------------------
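A minimal sketch of using a couple of the filters above directly on an RDKit mol; the canonical orderings live in Sequences.py, and the salt SMILES and the expected output are illustrative assumptions.

# Minimal sketch: applying individual Filters steps by hand.
from rdkit.Chem import MolFromSmiles, MolToSmiles, SanitizeMol
from utilities.chemtools.Filters import Filters

mol = MolFromSmiles('CC(=O)[O-].[Na+]')   # an acetate salt, two fragments
mol = Filters.neutralise_charge(mol)      # protonates the carboxylate ([Na+] is left untouched)
SanitizeMol(mol)
mol = Filters.keep_biggest(mol)           # keeps the acid, drops the counter-ion
print(MolToSmiles(mol))                   # expected, roughly: 'CC(=O)O'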
/utilities/chemtools/Sequences.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Sequences of filters to be used for standardization."""
3 |
4 |
5 | from utilities.chemtools.Filters import Filters
6 | from rdkit.Chem import Cleanup, SanitizeMol, SanitizeFlags
7 | from rdkit.Chem.AllChem import AssignStereochemistry
8 |
9 |
10 | def sequence_rr_legacy(mol):
11 | """Sequence of filters applied for the first version of RetroRules
12 | """
13 | F = Filters()
14 | Cleanup(mol)
15 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
16 | AssignStereochemistry(mol, cleanIt=True, force=True, flagPossibleStereoCenters=True) # Fix bug TD201904.01
17 | mol = F.remove_isotope(mol)
18 | mol = F.neutralise_charge(mol)
19 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
20 | mol = F.keep_biggest(mol)
21 | mol = F.add_hydrogen(mol, addCoords=True)
22 | mol = F.kekulize(mol)
23 | return mol
24 |
25 |
26 | def sequence_tunable(
27 | mol,
28 | OP_REMOVE_ISOTOPE=True, OP_NEUTRALISE_CHARGE=True,
29 | OP_REMOVE_STEREO=False, OP_COMMUTE_INCHI=False,
30 | OP_KEEP_BIGGEST=True, OP_ADD_HYDROGEN=True,
31 | OP_KEKULIZE=True, OP_NEUTRALISE_CHARGE_LATE=True
32 | ):
33 | """Tunable sequence of filters for standardization.
34 |
35 |     Operations will be made in the following order:
36 | 1 RDKit Cleanup -- always
37 | 2 RDKIT SanitizeMol -- always
38 | 3 Remove isotope -- optional (default: True)
39 | 4 Neutralise charges -- optional (default: True)
40 |     5 RDKit SanitizeMol -- if 3 or 4
41 | 6 Remove stereo -- optional (default: False)
42 | 7 Commute Inchi -- if 6 or optional (default: False)
43 | 8 Keep biggest -- optional (default: True)
44 | 9 RDKit SanitizeMol -- if any (6, 7, 8)
45 | 10 Add hydrogens -- optional (default: True)
46 | 11 Kekulize -- optional (default: True)
47 | """
48 | F = Filters()
49 | # Always perform the basics..
50 | Cleanup(mol)
51 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
52 | AssignStereochemistry(mol, cleanIt=True, force=True, flagPossibleStereoCenters=True) # Fix bug TD201904.01
53 | #
54 | if OP_REMOVE_ISOTOPE:
55 | mol = F.remove_isotope(mol)
56 | if OP_NEUTRALISE_CHARGE:
57 | mol = F.neutralise_charge(mol)
58 |     if any([OP_REMOVE_ISOTOPE, OP_NEUTRALISE_CHARGE]):
59 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
60 | #
61 | if OP_REMOVE_STEREO:
62 | mol = F.remove_stereo(mol)
63 | OP_COMMUTE_INCHI = True
64 | if OP_COMMUTE_INCHI:
65 | mol = F.commute_inchi(mol)
66 | if OP_KEEP_BIGGEST:
67 | mol = F.keep_biggest(mol)
68 | if any([OP_REMOVE_STEREO, OP_COMMUTE_INCHI, OP_KEEP_BIGGEST]):
69 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
70 | #
71 | if OP_NEUTRALISE_CHARGE_LATE:
72 | mol = F.neutralise_charge(mol)
73 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
74 | #
75 | if OP_ADD_HYDROGEN:
76 | mol = F.add_hydrogen(mol, addCoords=True)
77 | if OP_KEKULIZE:
78 | mol = F.kekulize(mol)
79 | #
80 | return mol
81 |
--------------------------------------------------------------------------------
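A minimal sketch of calling sequence_tunable directly with two of the options above overridden; the input SMILES and the expected output are illustrative assumptions.

# Minimal sketch: calling sequence_tunable directly, overriding two options.
from rdkit.Chem import MolFromSmiles, MolToSmiles
from utilities.chemtools.Sequences import sequence_tunable

mol = MolFromSmiles('C[C@H](O)C(=O)[O-]')   # (S)-lactate anion
mol = sequence_tunable(mol, OP_REMOVE_STEREO=True, OP_ADD_HYDROGEN=False)
print(MolToSmiles(mol))   # expected, roughly: 'CC(O)C(=O)O' (stereo removed, charge neutralised)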
/utilities/chemtools/Standardizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Standardize chemicals
4 |
5 | This is basically a rework of the standardizer.py written by Baudoin Delepine
6 | at INRA.
7 |
8 | @author: Baudoin Delepine, 2016-2017
9 | @author: Thomas Duigou, 2018-2019
10 | """
11 |
12 | from utilities.chemtools import Sequences
13 | from utilities.chemtools.Filters import Filters
14 | from rdkit.Chem import SanitizeMol, SanitizeFlags
15 | from rdkit.Chem.AllChem import AssignStereochemistry
16 |
17 | class Standardizer(object):
18 | """Handle standardization of compound(s) through user-defined "filters".
19 | """
20 |
21 | def __call__(self, mol):
22 | """Calling the Standardizer class like a function is the same
23 | as calling its "compute" method.
24 |
25 |         From:
26 | https://github.com/mcs07/MolVS/blob/master/molvs/standardize.py
27 | """
28 | return self.compute(mol)
29 |
30 | def __init__(self, sequence_fun=None, params=None):
31 |         """Set up parameters for the standardization.
32 |         :param sequence_fun: a callable, or the name of a function from Sequences, used to standardize compounds
33 |         :param params: dict of keyword arguments forwarded to the sequence function
34 |         """
35 | # Function to be used for standardizing compounds
36 |         # Add your own function as a class method
37 | if sequence_fun is None:
38 | self.sequence_fun = self.sequence_minimal
39 |         elif callable(sequence_fun):  # Guess: sequence_fun is the function itself
40 | self.sequence_fun = sequence_fun
41 | elif type(sequence_fun) == str:
42 | self.sequence_fun = getattr(Sequences, sequence_fun) # Guess: sequence_fun is the name of the function
43 | # Arguments to be passed to any custom standardization function
44 | self._params = params if params else None
45 |
46 | def sequence_minimal(self, mol):
47 | """Minimal standardization."""
48 | SanitizeMol(mol, sanitizeOps=SanitizeFlags.SANITIZE_ALL, catchErrors=False)
49 | AssignStereochemistry(mol, cleanIt=True, force=True, flagPossibleStereoCenters=True) # Fix bug TD201904.01
50 | return mol
51 |
52 | def compute(self, mol):
53 | """Do the job."""
54 | if self._params is None:
55 | return self.sequence_fun(mol)
56 | else:
57 | return self.sequence_fun(mol, **self._params)
58 |
--------------------------------------------------------------------------------
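A minimal sketch of the Standardizer wrapper, driven the same way utilities/reactor/Utils.standardize_chemical drives it: a sequence named by string plus a params dict forwarded to it; the input SMILES is an illustrative assumption.

# Minimal sketch: Standardizer with a named sequence and forwarded parameters.
from rdkit.Chem import MolFromSmiles, MolToSmiles
from utilities.chemtools.Standardizer import Standardizer

params = {'OP_REMOVE_STEREO': True, 'OP_ADD_HYDROGEN': False}
standardizer = Standardizer(sequence_fun='sequence_tunable', params=params)
mol = standardizer(MolFromSmiles('OC1=CC=CC=C1'))   # __call__ delegates to compute()
print(MolToSmiles(mol))                             # phenol, kekulised by the default options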
/utilities/chemtools/Utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """
3 | Starting a new toolbox to handle chemical compounds
4 | """
5 |
6 | from rdkit.Chem import MolFromSmiles, MolFromInchi, MolToSmiles, MolToInchi, MolToInchiKey, AddHs
7 |
8 |
9 | def convert_depiction(idepic, itype='smiles', otype={'inchikey'}):
10 |     """Convert a chemical depiction to other types of depictions
11 |
12 | :param idepic: string depiction to be converted, str
13 | :param itype: type of depiction provided as input, str
14 | :param otype: types of depiction to be generated, {"", "", ..}
15 | :return odepic: generated depictions, {"otype1": "odepic1", ..}
16 |
17 | Usage example:
18 | - convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
19 | - convert_depiction(idepic='InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3', itype='inchi', otype={'inchi', 'smiles', 'inchikey'})
20 | """
21 | # Import (if needed)
22 | if itype == 'smiles':
23 | rdmol = MolFromSmiles(idepic, sanitize=True)
24 | elif itype == 'inchi':
25 | rdmol = MolFromInchi(idepic, sanitize=True)
26 | else:
27 | raise NotImplementedError('"{}" is not a valid input type'.format(itype))
28 |     if rdmol is None:  # Check that the import succeeded
29 | raise Exception('Import error from depiction "{}" of type "{}"'.format(idepic, itype))
30 |
31 | # Export
32 | odepic = dict()
33 | for item in otype:
34 | if item == 'smiles':
35 |             odepic[item] = MolToSmiles(rdmol)  # MolToSmiles is tricky, one may want to check the possible options
36 | elif item == 'inchi':
37 | odepic[item] = MolToInchi(rdmol)
38 | elif item == 'inchikey':
39 | odepic[item] = MolToInchiKey(rdmol)
40 | else:
41 |             raise NotImplementedError('"{}" is not a valid output type'.format(item))
42 |
43 | return odepic
44 |
--------------------------------------------------------------------------------
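A minimal sketch of the dict returned by convert_depiction; the ethanol InChI is the one from the docstring, and the InChIKey shown in the comment is the value RDKit is expected to produce.

# Minimal sketch: convert_depiction returns a dict keyed by the requested types.
from utilities.chemtools.Utils import convert_depiction

out = convert_depiction(idepic='CCO', otype={'inchi', 'smiles', 'inchikey'})
# Expected, approximately:
# {'smiles': 'CCO',
#  'inchi': 'InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3',
#  'inchikey': 'LFQSCWFLJHTTHZ-UHFFFAOYSA-N'}
print(out)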
/utilities/reactor/Core.py:
--------------------------------------------------------------------------------
1 | """
2 | Core code for firing rules
3 | """
4 |
5 |
6 | class RuleMatchError(Exception):
7 | """Raised when something went wrong when matching a rule."""
8 |
9 | def __init__(self, msg):
10 | self._msg = msg
11 |
12 | def __str__(self):
13 | return "RULE-MATCH-ERROR: {}".format(self._msg)
14 |
15 |
16 | class RuleFireError(Exception):
17 | """Raised when something went wrong when firing a rule."""
18 |
19 | def __init__(self, msg):
20 | self._msg = msg
21 |
22 | def __str__(self):
23 | return "RULE-FIRE-ERROR: {}".format(self._msg)
24 |
25 |
26 | class RuleBurnerCore(object):
27 | """Apply one rule on one chemical."""
28 |
29 | def __init__(self, rd_rule, rd_mol):
30 | """Apply one rule on one chemical.
31 |
32 |         Notice: no standardization is performed on the input chemicals and rules.
33 |
34 |         :param rd_rule: RDKit reaction object, the reaction rule to apply
35 |         :param rd_mol: RDKit mol object, the chemical the rule is applied to
36 |
37 | """
38 | # Internal settings
39 |         USE_CHIRALITY_IN_MATCH = False  # default for substructure matching anyway (currently unused)
40 | # Input
41 | self._rd_rule = rd_rule
42 | self._rd_mol = rd_mol
43 |
44 | def match(self):
45 | """Check if left reaction side match the chemical.
46 |
47 | returns: bool, True if there is a match, else False
48 | """
49 | try:
50 | for reactant in self._rd_rule.GetReactants():
51 |                 if self._rd_mol.HasSubstructMatch(reactant):
52 | return True
53 | return False
54 | except Exception as e:
55 | raise RuleMatchError(e) from e
56 |
57 | def fire(self):
58 | """Fire the rule on the chemical.
59 |
60 | returns: tuple of tuple, list of results for each possible application.
61 | """
62 | try:
63 | return self._rd_rule.RunReactants((self._rd_mol,))
64 | except Exception as e:
65 | raise RuleFireError(e) from e
66 |
--------------------------------------------------------------------------------
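A minimal sketch of the RuleBurnerCore API on a toy rule (a secondary-alcohol oxidation written ad hoc for this sketch, not a RetroRules rule); it only illustrates match()/fire(), and no standardization is performed, as the class docstring notes.

# Minimal sketch of RuleBurnerCore on a toy, ad hoc rule.
from rdkit import Chem
from rdkit.Chem import AllChem
from utilities.reactor.Core import RuleBurnerCore

rd_rule = AllChem.ReactionFromSmarts('[CH1:1]([OH:2])>>[CH0:1]=[OH0:2]')  # toy alcohol -> ketone rule
rd_mol = Chem.MolFromSmiles('CC(O)C(=O)O')                                # lactic acid
core = RuleBurnerCore(rd_rule, rd_mol)
if core.match():                          # does the left-hand side match the substrate?
    for products in core.fire():          # one tuple of product Mols per application
        for product in products:
            Chem.SanitizeMol(product)     # RunReactants products are not sanitized
        print([Chem.MolToSmiles(p) for p in products])   # expected, roughly: ['CC(=O)C(=O)O']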
/utilities/reactor/Utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Set of methods to handle reaction I/Os
3 | """
4 |
5 |
6 | import copy
7 | import rdkit
8 | import logging
9 |
10 | from rdkit import Chem
11 | from rdkit.Chem import MolToInchiKey
12 | from rdkit import RDLogger
13 | from utilities.chemtools.Standardizer import Standardizer
14 |
15 |
16 | RD_LOGGER = RDLogger.logger()
17 | RD_LOGGER.setLevel(RDLogger.CRITICAL)  # Silence most RDKit complaints
18 |
19 |
20 | class ChemConversionError(Exception):
21 | """Raised when something went wrong during chemical conversion to RDKit mol object."""
22 |
23 | def __init__(self, msg):
24 | self._msg = msg
25 |
26 | def __str__(self):
27 | return "CHEM-CONVERSION-ERROR: {}".format(self._msg)
28 |
29 |
30 | def wild_stereo_removal(rdmol):
31 | """Wild stereo removal using back and forth Inchi depiction.
32 |
33 | :param rdmol: RDKit mol
34 | :returns rdmol_new: newly generated RDKit mol
35 | """
36 | tmp_rdmol = copy.deepcopy(rdmol)
37 | Chem.RemoveStereochemistry(tmp_rdmol)
38 | return Chem.MolFromInchi(Chem.MolToInchi(tmp_rdmol))
39 |
40 |
41 | def standardize_chemical_archive(rdmol, add_hs=True, rm_stereo=True):
42 | """Standardize a chemical using RDKit sanitize method.
43 |
44 | :param rdmol: RDKit mol object
45 | :param add_hs: append Hs, bool (default: True)
46 | :param rm_stereo: remove stereo, bool (default: True)
47 | :returns rdmol: RDKit mol object
48 | """
49 | try:
50 | Chem.SanitizeMol(rdmol)
51 | if rm_stereo: # Important: do this before adding Hs (else re-add Hs)
52 | rdmol = wild_stereo_removal(rdmol)
53 | if add_hs:
54 | rdmol = Chem.AddHs(rdmol)
55 | else:
56 | rdmol = Chem.RemoveHs(rdmol)
57 | return rdmol
58 | except Exception as e:
59 | logging.warning(e)
60 | raise e
61 |
62 |
63 | def standardize_chemical(rdmol, add_hs=True, rm_stereo=True, heavy=False):
64 | """Standardize a chemical using RDKit sanitize method.
65 |
66 | :param rdmol: RDKit mol object
67 | :param add_hs: append Hs, bool (default: True)
68 | :param rm_stereo: remove stereo, bool (default: True)
69 | :param heavy: perform custom in depth standardization (default: False)
70 | :returns rdmol: RDKit mol object
71 | """
72 | # if not rm_stereo:
73 | # logging.warning("Stereo not handled at the time being.")
74 | # raise ChemConversionError("Stereo not handled at the time being.")
75 | simple_standardisation = {
76 | 'OP_REMOVE_ISOTOPE': False,
77 | 'OP_NEUTRALISE_CHARGE': False,
78 | 'OP_REMOVE_STEREO': rm_stereo,
79 | 'OP_COMMUTE_INCHI': True,
80 | 'OP_KEEP_BIGGEST': False,
81 | 'OP_ADD_HYDROGEN': add_hs,
82 | 'OP_KEKULIZE': False,
83 | 'OP_NEUTRALISE_CHARGE_LATE': True
84 | }
85 | heavy_standardisation = {
86 | 'OP_REMOVE_ISOTOPE': True,
87 | 'OP_NEUTRALISE_CHARGE': True,
88 | 'OP_REMOVE_STEREO': rm_stereo,
89 | 'OP_COMMUTE_INCHI': True,
90 | 'OP_KEEP_BIGGEST': True,
91 | 'OP_ADD_HYDROGEN': add_hs,
92 | 'OP_KEKULIZE': False,
93 | 'OP_NEUTRALISE_CHARGE_LATE': True
94 | }
95 |
96 | try:
97 | if heavy:
98 | rdmol = Standardizer(sequence_fun='sequence_tunable', params=heavy_standardisation).compute(rdmol)
99 | logging.debug("Performing heavy standardisation for compound {}".format(MolToInchiKey(rdmol)))
100 | else:
101 | rdmol = Standardizer(sequence_fun='sequence_tunable', params=simple_standardisation).compute(rdmol)
102 | return rdmol
103 | except Exception as e:
104 | logging.warning(e)
105 | raise e
106 |
107 |
108 | def standardize_results(tuple_tuple_rdmol, add_hs=True, rm_stereo=True):
109 | """Perform sanitization and remove duplicates from reaction rule results.
110 |
111 | :param tuple_tuple_rdmol: tuple of tuple of RDKit Mol
112 | :param add_hs: append Hs, bool (default: True)
113 | :param rm_stereo: remove stereo, bool (default: True)
114 | :returns list_list_std: list of list of standardized RDKit Mol
115 |     :returns list_idx_tuple_failed: list of indices of the tuples that failed standardization
116 | """
117 | uniq_depics = set()
118 | list_list_std = list()
119 | list_idx_tuple_failed = list()
120 |
121 | for idx_tuple, tuple_rdmol in enumerate(tuple_tuple_rdmol):
122 | try:
123 | list_std = list()
124 | list_inchikeys = list()
125 | # Standardize
126 | for rdmol in tuple_rdmol:
127 | for rd_frag in Chem.GetMolFrags(rdmol, asMols=True, sanitizeFrags=False):
128 | list_std.append(standardize_chemical(rd_frag, add_hs=add_hs, rm_stereo=rm_stereo))
129 | # Get Inchikeys
130 | for rdmol in list_std:
131 | inchikey = Chem.MolToInchiKey(rdmol)
132 | if inchikey:
133 | list_inchikeys.append(inchikey)
134 | else:
135 | msg = 'Product conversion to InChIKey raised an empty string'
136 | logging.warning(ChemConversionError(msg))
137 | raise ChemConversionError(msg)
138 | # Get unique depiction
139 | depic = '.'.join(sorted(list_inchikeys))
140 |             # Store only if this unique depiction has not been seen before
141 | if depic not in uniq_depics:
142 | uniq_depics.add(depic)
143 | list_list_std.append(list_std)
144 | except ChemConversionError as e:
145 | logging.warning("{}".format(e))
146 | list_idx_tuple_failed.append(idx_tuple)
147 | raise e
148 | except Exception as e:
149 |             logging.warning("Cannot handle a tuple of results, skipped")
150 | logging.warning("{}".format(e))
151 | list_idx_tuple_failed.append(idx_tuple)
152 |
153 | return list_list_std, list_idx_tuple_failed
154 |
155 |
156 | def handle_results(list_list_rdmol):
157 | """Generate InchiKey, Inchi and SMILES from results.
158 |
159 | :param list_list_rdmol: list of list of RDKit Mol
160 | :returns list_list_inchikeys: list of list of InchiKeys
161 | :returns list_list_inchis: list of list of Inchis
162 | :returns list_list_smiles: list of list of SMILES
163 | """
164 | list_list_inchikeys = list()
165 | list_list_inchis = list()
166 | list_list_smiles = list()
167 |
168 | for list_rdmol in list_list_rdmol:
169 | try:
170 | list_inchikeys = list()
171 | list_inchis = list()
172 | list_smiles = list()
173 | list_std = list()
174 | for rdmol in list_rdmol:
175 | # Get & check depictions
176 | inchikey = Chem.MolToInchiKey(rdmol) # DEBUG: this part could be optimized
177 | inchi = Chem.MolToInchi(rdmol)
178 | smiles = Chem.MolToSmiles(rdmol)
179 | if not all([inchikey, inchi, smiles]):
180 | raise ChemConversionError("Chemical conversion error")
181 | # Store if we reach there
182 | list_inchikeys.append(inchikey)
183 | list_inchis.append(inchi)
184 | list_smiles.append(smiles)
185 | # Store if we reach the end
186 | list_list_inchikeys.append(list_inchikeys)
187 | list_list_inchis.append(list_inchis)
188 | list_list_smiles.append(list_smiles)
189 | except ChemConversionError as e:
190 | logging.warning("{}".format(e))
191 | raise e
192 | except Exception as e:
193 |             logging.warning("Cannot handle a tuple of results, skipped")
194 | logging.warning("{}".format(e))
195 | return list_list_inchikeys, list_list_inchis, list_list_smiles # Quick but dirty
196 |
--------------------------------------------------------------------------------
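Finally, a minimal sketch of how the helpers in this file are meant to chain together: standardize a substrate, fire a rule through RuleBurnerCore, then push the raw products through standardize_results and handle_results. The rule SMARTS is a toy example written for this sketch, not a RetroRules rule, and the substrate SMILES is an illustrative assumption.

# Minimal end-to-end sketch: standardize -> fire -> standardize_results -> handle_results.
from rdkit import Chem
from rdkit.Chem import AllChem
from utilities.reactor.Core import RuleBurnerCore
from utilities.reactor.Utils import standardize_chemical, standardize_results, handle_results

substrate = standardize_chemical(Chem.MolFromSmiles('CC(O)C(=O)O'), add_hs=False, rm_stereo=True)
rule = AllChem.ReactionFromSmarts('[CH1:1]([OH:2])>>[CH0:1]=[OH0:2]')   # toy alcohol -> ketone rule
raw_products = RuleBurnerCore(rule, substrate).fire()                   # tuple of tuples of raw Mols
std_products, failed = standardize_results(raw_products, add_hs=True, rm_stereo=True)
inchikeys, inchis, smiles = handle_results(std_products)
print(inchikeys, failed)    # one list of product InChIKeys per surviving result set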