├── .github └── workflows │ └── install.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets └── logo │ ├── LLcheD.png │ ├── LLcheD.svg │ ├── LLcheM.png │ ├── LLcheM.svg │ ├── chemnlp.png │ └── chemnlp.svg ├── code_of_conduct.md ├── conda.yaml ├── data ├── check_pandas.py ├── check_smiles_split.py ├── kg │ ├── chebi_chebi │ │ └── meta.yaml │ ├── chembl33_preprocessed_filtered_bioactivity_dataset_w_fullprotnames_smiles │ │ └── meta.yaml │ ├── compound_chebi │ │ └── meta.yaml │ ├── compound_chebi_chebi │ │ └── meta.yaml │ ├── compound_chebi_chebi_chebi_1 │ │ └── meta.yaml │ ├── compound_chebi_chebi_chebi_2 │ │ └── meta.yaml │ ├── compound_protein │ │ └── meta.yaml │ ├── compound_protein_compound_1 │ │ └── meta.yaml │ ├── compound_protein_compound_2 │ │ └── meta.yaml │ ├── compound_protein_compound_3 │ │ └── meta.yaml │ ├── compound_protein_disease │ │ └── meta.yaml │ ├── compound_protein_domain │ │ └── meta.yaml │ ├── compound_protein_ec_number │ │ └── meta.yaml │ ├── compound_protein_go_term_1 │ │ └── meta.yaml │ ├── compound_protein_go_term_2 │ │ └── meta.yaml │ ├── compound_protein_go_term_3 │ │ └── meta.yaml │ ├── compound_protein_go_term_4 │ │ └── meta.yaml │ ├── compound_protein_hpo │ │ └── meta.yaml │ ├── compound_protein_hpo_disease_1 │ │ └── meta.yaml │ ├── compound_protein_hpo_disease_2 │ │ └── meta.yaml │ ├── compound_protein_pathway │ │ └── meta.yaml │ ├── compound_protein_pathway_disease_1 │ │ └── meta.yaml │ ├── compound_protein_pathway_disease_2 │ │ └── meta.yaml │ ├── compound_protein_pathway_disease_3 │ │ └── meta.yaml │ ├── compound_protein_protein │ │ └── meta.yaml │ ├── drug_chebi │ │ └── meta.yaml │ ├── drug_chebi_chebi │ │ └── meta.yaml │ ├── drug_chebi_chebi_chebi │ │ └── meta.yaml │ ├── drug_disease_pathway │ │ └── meta.yaml │ ├── drug_disease_pathway_protein │ │ └── meta.yaml │ ├── drug_protein │ │ └── meta.yaml │ ├── drug_protein_disease │ │ └── meta.yaml │ ├── drug_protein_domain │ │ └── meta.yaml │ ├── drug_protein_drug │ │ └── meta.yaml │ ├── drug_protein_ec_number │ │ └── meta.yaml │ ├── drug_protein_go_term │ │ └── meta.yaml │ ├── drug_protein_hpo │ │ └── meta.yaml │ ├── drug_protein_hpo_disease │ │ └── meta.yaml │ ├── drug_protein_pathway │ │ └── meta.yaml │ ├── drug_protein_pathway_disease │ │ └── meta.yaml │ └── drug_protein_protein │ │ └── meta.yaml ├── natural │ ├── preprocess_europepmc.py │ ├── preprocess_msds.py │ ├── preprocess_nougat.py │ └── preprocess_nougat.sh ├── postprocess_split.py ├── tabular │ ├── BACE │ │ ├── meta.yaml │ │ └── transform.py │ ├── BBBP │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_466 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_548 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_600 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_644 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_652 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_689 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_692 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_712 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_713 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_733 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_737 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_810 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_832 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_846 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_852 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_858 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_859 │ │ ├── meta.yaml │ │ └── transform.py │ ├── RedDB │ │ ├── meta.yaml │ │ └── 
transform.py │ ├── SIDER │ │ ├── meta.yaml │ │ └── transform.py │ ├── ames_mutagenicity │ │ ├── meta.yaml │ │ └── transform.py │ ├── aminoacids │ │ ├── meta.yaml │ │ └── transform.py │ ├── bc5chem │ │ ├── meta.yaml │ │ └── transform.py │ ├── bc5disease │ │ ├── meta.yaml │ │ └── transform.py │ ├── bicerano_dataset │ │ ├── meta.yaml │ │ └── transform.py │ ├── bio_ner │ │ ├── meta.yaml │ │ └── transform.py │ ├── bioavailability_ma_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── block_polymers_morphology │ │ ├── meta.yaml │ │ └── transform.py │ ├── blood_brain_barrier_martins_et_al │ │ └── transform.py │ ├── buchwald_hartwig │ │ ├── meta.yaml │ │ └── transform.py │ ├── caco2_wang │ │ ├── meta.yaml │ │ └── transform.py │ ├── carcinogens │ │ ├── meta.yaml │ │ └── transform.py │ ├── cav3_t-type_calcium_channels_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── chebi_20 │ │ ├── meta.yaml │ │ └── transform.py │ ├── check_pandas.py │ ├── check_smiles_split.py │ ├── chem_caption_smarts │ │ ├── meta.yaml │ │ ├── preprocess.py │ │ └── transform.py │ ├── chembl_v29 │ │ ├── meta.yaml │ │ └── transform.py │ ├── chemcaption_fragments │ │ ├── meta.yaml │ │ └── transform.py │ ├── chemcaption_rdkit │ │ ├── meta.yaml │ │ ├── preprocess.py │ │ └── transform.py │ ├── chemdner │ │ ├── meta.yaml │ │ └── transform.py │ ├── chemistry_stackexchange │ │ ├── meta.yaml │ │ └── transform.py │ ├── choline_transporter_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── clearance_astrazeneca │ │ ├── meta.yaml │ │ └── transform.py │ ├── clintox │ │ ├── meta.yaml │ │ └── transform.py │ ├── core_mof_no_topo │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp2c9_substrate_carbonmangels │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp2d6_substrate_carbonmangels │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp3a4_substrate_carbonmangels │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_1a2_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_2c19_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_2c9_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_2d6_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_3a4_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── drug_induced_liver_injury │ │ ├── meta.yaml │ │ └── transform.py │ ├── drugchat_liang_zhang_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── fda_adverse_reactions │ │ ├── meta.yaml │ │ └── transform.py │ ├── flashpoint │ │ ├── meta.yaml │ │ └── transform.py │ ├── formation_energies │ │ ├── meta.yaml │ │ └── transform.py │ ├── freesolv │ │ ├── meta.yaml │ │ └── transform.py │ ├── h2_storage_materials │ │ ├── LICENSE │ │ ├── meta.yaml │ │ ├── processing.ipynb │ │ └── transform.py │ ├── half_life_obach │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_blockers │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_central_at_10uM │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_central_at_1uM │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_central_inhib │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_karim_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── hiv │ │ ├── meta.yaml │ │ └── transform.py │ ├── human_intestinal_absorption │ │ ├── meta.yaml │ │ └── transform.py │ ├── inverse_1 │ │ ├── meta.yaml │ │ └── transform.py │ ├── inverse_2 │ │ ├── meta.yaml │ │ └── transform.py │ ├── inverse_3 │ │ ├── meta.yaml │ │ └── transform.py │ ├── iupac_goldbook │ │ ├── meta.yaml │ │ └── transform.py │ ├── iupac_smiles │ │ ├── meta.yaml │ │ └── 
transform.py │ ├── kcnq2_potassium_channel_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── ld50_catmos │ │ ├── meta.yaml │ │ └── transform.py │ ├── ld50_zhu │ │ ├── example_processing_and_templates.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── lipophilicity │ │ ├── data_original.txt │ │ ├── meta.yaml │ │ └── transform.py │ ├── m1_muscarinic_receptor_agonists_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── m1_muscarinic_receptor_antagonists_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── mattermodeling_stackexchange │ │ ├── meta.yaml │ │ └── transform.py │ ├── melting_points │ │ ├── meta.yaml │ │ └── transform.py │ ├── merge.py │ ├── mofdscribe │ │ ├── meta.yaml │ │ └── transform.py │ ├── mol2svg │ │ ├── meta.yaml │ │ └── transform.py │ ├── mol_repr_transl │ │ └── transform.py │ ├── mona │ │ ├── example_processing_and_templates.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── moses │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_anisotropy │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_bulk_modulus │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_descriptions │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_self_supervised │ │ ├── meta.yaml │ │ ├── prepare_data.py │ │ └── transform.py │ ├── mp_shear_modulus │ │ ├── meta.yaml │ │ └── transform.py │ ├── ncbi_disease │ │ ├── meta.yaml │ │ └── transform.py │ ├── nlmchem │ │ ├── meta.yaml │ │ └── transform.py │ ├── nomad_structure │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ahr_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ar_lbd_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ar_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_aromatase_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_er_lbd_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_er_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ppar_gamma_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── ocp │ │ ├── meta.yaml │ │ └── transform.py │ ├── odd_one_out │ │ ├── meta.yaml │ │ └── transform.py │ ├── opv │ │ ├── meta.yaml │ │ └── transform.py │ ├── oqmd │ │ ├── meta.yaml │ │ └── transform.py │ ├── orbnet_denali │ │ ├── develop_transform.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_masked │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_predictions │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_procedure_steps │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_rxn_smiles_procedure │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_rxn_smiles_yield_pred │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_steps_yield │ │ ├── meta.yaml │ │ └── transform.py │ ├── orexin1_receptor_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── p_glycoprotein_inhibition_broccatelli_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── pampa_ncats │ │ ├── example_processing_and_templates.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── peptides_hemolytic │ │ ├── meta.yaml │ │ └── transform.py │ ├── peptides_nonfouling │ │ ├── meta.yaml │ │ └── transform.py │ ├── peptides_soluble │ │ ├── meta.yaml │ │ └── transform.py │ ├── perovskite_db │ │ ├── meta.yaml │ │ └── transform.py │ ├── physics_stackexchange │ │ ├── explore.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── potassium_ion_channel_kir2_1_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── qm8 │ │ ├── meta.yaml │ │ └── transform.py │ ├── qm9 │ │ ├── meta.yaml │ │ ├── prep_csv.py │ │ └── transform.py │ ├── qmof_gcmc │ │ ├── meta.yaml │ │ └── transform.py │ ├── qmof_quantum │ │ ├── meta.yaml │ │ └── transform.py │ ├── rdkit_features │ │ ├── meta.yaml │ │ └── transform.py │ 
├── rhea_db_masked │ │ ├── meta.yaml │ │ └── transform.py │ ├── rhea_db_predictions │ │ ├── meta.yaml │ │ └── transform.py │ ├── run_all_transform.sh │ ├── sarscov2_3clpro_diamond │ │ ├── meta.yaml │ │ └── transform.py │ ├── sarscov2_vitro_touret │ │ ├── meta.yaml │ │ └── transform.py │ ├── serine_threonine_kinase_33_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── sigma_aldrich_safety_data │ │ ├── meta.yaml │ │ └── transform.py │ ├── skin_reaction │ │ ├── meta.yaml │ │ └── transform.py │ ├── smiles_to_3d │ │ ├── meta.yaml │ │ └── transform.py │ ├── solubility_aqsoldb │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_are_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_atad5_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_hse_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_mmp_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_p53_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── suzuki_miyaura_sach │ │ ├── meta.yaml │ │ └── transform.py │ ├── thermosol │ │ ├── meta.yaml │ │ └── transform.py │ ├── train_test_split.py │ ├── tyrosyl-dna_phosphodiesterase_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_binding_single │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_binding_sites_multiple │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_organisms │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_reactions │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_sentences │ │ ├── meta.yaml │ │ └── transform.py │ ├── uspto │ │ ├── meta.yaml │ │ └── transform.py │ ├── uspto_yield │ │ ├── meta.yaml │ │ └── transform.py │ ├── volume_of_distribution_at_steady_state_lombardo_et_al │ │ ├── meta.yaml │ │ └── transform.py │ └── zinc │ │ ├── meta.yaml │ │ └── transform.py ├── text_sampling │ ├── extend_tabular.py │ ├── extend_tabular_processed.py │ ├── get_dataset_overlap.py │ ├── preprocess_kg.py │ ├── text_sampling.py │ └── utils.py └── train_test_split.py ├── docs ├── CONTRIBUTING.md ├── api │ ├── meta_yaml_augmentor.md │ ├── meta_yaml_generator.md │ ├── sampler.md │ └── sampler_cli.md └── index.md ├── experiments ├── README.md ├── ablations │ ├── 20240814_sample_data.bash │ └── continued_pretrain.py ├── configs │ ├── data_configs │ │ ├── data_mixing.yml │ │ ├── hf_data.yml │ │ ├── hf_data_wiki.yml │ │ ├── prep_lm_eval_data.yml │ │ └── prep_smiles_data.yml │ ├── deepspeed │ │ ├── deepspeed_S1.json │ │ ├── deepspeed_S2.json │ │ ├── deepspeed_offload_S2.json │ │ └── deepspeed_offload_S3.json │ ├── eval_configs │ │ ├── default_eval_config.yaml │ │ ├── nlp_eval_config.yaml │ │ ├── safety_eval_config.yaml │ │ └── stem_eval_config.yaml │ ├── gpt-neox │ │ ├── 160M.yml │ │ ├── cluster_setup.yml │ │ └── soft_prompt.yml │ └── hugging-face │ │ ├── 160M_full.yml │ │ ├── 160M_ptune.yml │ │ ├── 1B_fine_tune.yml │ │ ├── 3B_fine_tune.yml │ │ ├── 410M_fine_tune.yml │ │ └── 7B_fine_tune.yml ├── data │ ├── merge_epmc_to_jsonl.py │ ├── prepare_gptneox_chemrxiv.py │ ├── prepare_hf_dataset.py │ ├── prepare_lm_eval_dataset.py │ ├── prepare_mixed_data.py │ ├── prepare_smiles_dataset.py │ ├── prepare_xyz_denali_data.py │ ├── sbatch_hf_dataset.sh │ ├── sbatch_hf_split.sh │ ├── sbatch_merge_epmc_jsonl.sh │ └── split_data.py ├── scripts │ ├── env_creation_hf.sh │ ├── env_creation_neox.sh │ ├── eval_create_batch_configs.py │ ├── miniconda_install.sh │ ├── run_eval.sh │ ├── run_eval_batch.sh │ ├── run_grid_search.py │ ├── run_n_shot_benchmarks_eval.py │ ├── run_tune.py │ ├── sbatch_train_hf.sh │ ├── sbatch_train_hf_multinode.sh │ ├── sbatch_train_neox.sh │ ├── 
transfer_all_checkpoint_to_s3.sh │ ├── transfer_checkpoint_to_s3.sh │ └── transfer_hf_cache.sh └── working │ └── calculate_nll.py ├── mkdocs.yml ├── pyproject.toml ├── src └── chemnlp │ ├── __init__.py │ ├── data │ ├── constants.py │ ├── convert.py │ ├── hf_datasets.py │ ├── meta.yaml │ ├── meta_yaml_augmentor.py │ ├── meta_yaml_generator.py │ ├── ner.py │ ├── random_variable.py │ ├── reprs.py │ ├── sampler.py │ ├── sampler_cli.py │ ├── split.py │ └── utils.py │ ├── data_val │ ├── __init__.py │ ├── config.py │ ├── model.py │ └── validate.py │ └── utils.py └── tests ├── __init__.py ├── data ├── __init__.py ├── test_sampler.py └── test_sampler_cli.py ├── test_ner.py └── test_reprs.py /.github/workflows/install.yaml: -------------------------------------------------------------------------------- 1 | # GitHub action that attempts to install the conda env 2 | # from conda.yaml 3 | # then run black, isort, flake8 4 | name: Install 5 | on: [push, pull_request] 6 | jobs: 7 | install: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: conda-incubator/setup-miniconda@v2 12 | with: 13 | environment-file: conda.yaml 14 | activate-environment: chemnlp 15 | python-version: 3.9 16 | auto-update-conda: true 17 | auto-activate-base: false 18 | - name: Validate yaml 19 | shell: bash -l {0} 20 | run: | 21 | conda activate chemnlp 22 | python -m src.chemnlp.data_val.validate data 23 | - name: Tests 24 | shell: bash -l {0} 25 | run: | 26 | pip install pytest 27 | pytest tests 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenBioML 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/assets/logo/LLcheD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/assets/logo/LLcheD.png
--------------------------------------------------------------------------------
/assets/logo/LLcheM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/assets/logo/LLcheM.png
--------------------------------------------------------------------------------
/assets/logo/chemnlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/assets/logo/chemnlp.png
--------------------------------------------------------------------------------
/conda.yaml:
--------------------------------------------------------------------------------
1 | name: dummy
2 | dependencies:
3 |   - python==3.9.*
4 |   - pip
5 |   - pip:
6 |       - .
7 |       - .[dev]
8 |       - .[dataset_creation]
--------------------------------------------------------------------------------
/data/kg/chebi_chebi/meta.yaml:
--------------------------------------------------------------------------------
1 | name: chebi_chebi
2 | description: Knowledge graph data samples.
3 | targets:
4 |   - id: node2_type
5 |     description: node2_type
6 |     type: Other
7 |     units: node2_type
8 |     names:
9 |       - noun: node2_type
10 |   - id: node2_name
11 |     description: node2_name
12 |     type: Other
13 |     units: node2_name
14 |     names:
15 |       - noun: node2_name
16 |   - id: node2_id
17 |     description: node2_id
18 |     type: Other
19 |     units: node2_id
20 |     names:
21 |       - noun: node2_id
22 | identifiers:
23 |   - id: node1_type
24 |     description: node1_type
25 |     type: Other
26 |   - id: node1_name
27 |     description: node1_name
28 |     type: Other
29 |   - id: node1_id
30 |     description: node1_id
31 |     type: Other
32 |   - id: rel1_type
33 |     description: rel1_type
34 |     type: Other
35 | license: CC BY 4.0
36 | links:
37 |   - url: https://crossbar.kansil.org
38 |     description: original knowledge graph web GUI link
39 | num_points: 638182
40 | bibtex:
41 |   - "@article{10.1093/nar/gkab543,\nauthor = {Doğan, Tunca and Atas, Heval and Joshi, Vishal and Atakan, Ahmet and Rifaioglu, Ahmet Sureyya and Nalbat, Esra and Nightingale, Andrew and Saidi, Rabie and Volynkin, Vladimir and Zellner, Hermann and Cetin-Atalay, Rengul and Martin, Maria and Atalay, Volkan},\ntitle = \"{CROssBAR: comprehensive resource of biomedical relations with knowledge graph representations}\",\njournal = {Nucleic Acids Research},\nvolume = {49},\nnumber = {16},\npages = {e96-e96},\nyear = {2021},\nmonth = {06},\nissn = {0305-1048},\ndoi = {10.1093/nar/gkab543},\nurl = {https://doi.org/10.1093/nar/gkab543},\n}"
42 | templates:
43 |   - The {node1_name#} {rel1_type#} {node2_name#}.
--------------------------------------------------------------------------------
/data/kg/compound_chebi/meta.yaml:
--------------------------------------------------------------------------------
1 | name: compound_chebi
2 | description: Knowledge graph data samples.
3 | targets:
4 |   - id: node1_type
5 |     description: node1_type
6 |     type: Other
7 |     units: node1_type
8 |     names:
9 |       - noun: node1_type
10 |   - id: node1_name
11 |     description: node1_name
12 |     type: Other
13 |     units: node1_name
14 |     names:
15 |       - noun: node1_name
16 |   - id: node1_id
17 |     description: node1_id
18 |     type: Other
19 |     units: node1_id
20 |     names:
21 |       - noun: node1_id
22 |   - id: rel1_type
23 |     description: rel1_type
24 |     type: Other
25 |     units: rel1_type
26 |     names:
27 |       - noun: rel1_type
28 |   - id: node2_type
29 |     description: node2_type
30 |     type: Other
31 |     units: node2_type
32 |     names:
33 |       - noun: node2_type
34 |   - id: node2_name
35 |     description: node2_name
36 |     type: Other
37 |     units: node2_name
38 |     names:
39 |       - noun: node2_name
40 |   - id: node2_id
41 |     description: node2_id
42 |     type: Other
43 |     units: node2_id
44 |     names:
45 |       - noun: node2_id
46 | identifiers:
47 |   - id: SMILES
48 |     description: SMILES
49 |     type: SMILES
50 | license: CC BY 4.0
51 | links:
52 |   - url: https://crossbar.kansil.org
53 |     description: original knowledge graph web GUI link
54 | num_points: 6754
55 | bibtex:
56 |   - "@article{10.1093/nar/gkab543,\nauthor = {Doğan, Tunca and Atas, Heval and Joshi, Vishal and Atakan, Ahmet and Rifaioglu, Ahmet Sureyya and Nalbat, Esra and Nightingale, Andrew and Saidi, Rabie and Volynkin, Vladimir and Zellner, Hermann and Cetin-Atalay, Rengul and Martin, Maria and Atalay, Volkan},\ntitle = \"{CROssBAR: comprehensive resource of biomedical relations with knowledge graph representations}\",\njournal = {Nucleic Acids Research},\nvolume = {49},\nnumber = {16},\npages = {e96-e96},\nyear = {2021},\nmonth = {06},\nissn = {0305-1048},\ndoi = {10.1093/nar/gkab543},\nurl = {https://doi.org/10.1093/nar/gkab543},\n}"
57 | templates:
58 |   - The {node1_type#} {SMILES__description} {SMILES#} {rel1_type#} {node2_name#}.
59 |   - |-
60 |     Task: Please {#create|generate!} {#a compound |a !}{SMILES__description} that {rel1_type#} {node2_name#}.
61 |     Result: {SMILES#}
62 |   - |-
63 |     Task: Please {#create|generate!} {#a compound |a !}{SMILES__description} that {rel1_type#} {node2_name#}.
64 |     Result: {SMILES#}
--------------------------------------------------------------------------------
/data/kg/drug_chebi/meta.yaml:
--------------------------------------------------------------------------------
1 | name: drug_chebi
2 | description: Knowledge graph data samples.
3 | targets:
4 |   - id: node1_type
5 |     description: node1_type
6 |     type: Other
7 |     units: node1_type
8 |     names:
9 |       - noun: node1_type
10 |   - id: node1_name
11 |     description: node1_name
12 |     type: Other
13 |     units: node1_name
14 |     names:
15 |       - noun: node1_name
16 |   - id: node1_id
17 |     description: node1_id
18 |     type: Other
19 |     units: node1_id
20 |     names:
21 |       - noun: node1_id
22 |   - id: rel1_type
23 |     description: rel1_type
24 |     type: Other
25 |     units: rel1_type
26 |     names:
27 |       - noun: rel1_type
28 |   - id: node2_type
29 |     description: node2_type
30 |     type: Other
31 |     units: node2_type
32 |     names:
33 |       - noun: node2_type
34 |   - id: node2_name
35 |     description: node2_name
36 |     type: Other
37 |     units: node2_name
38 |     names:
39 |       - noun: node2_name
40 |   - id: node2_id
41 |     description: node2_id
42 |     type: Other
43 |     units: node2_id
44 |     names:
45 |       - noun: node2_id
46 | identifiers:
47 |   - id: SMILES
48 |     description: SMILES
49 |     type: SMILES
50 | license: CC BY 4.0
51 | links:
52 |   - url: https://crossbar.kansil.org
53 |     description: original knowledge graph web GUI link
54 | num_points: 3033
55 | bibtex:
56 |   - "@article{10.1093/nar/gkab543,\nauthor = {Doğan, Tunca and Atas, Heval and Joshi, Vishal and Atakan, Ahmet and Rifaioglu, Ahmet Sureyya and Nalbat, Esra and Nightingale, Andrew and Saidi, Rabie and Volynkin, Vladimir and Zellner, Hermann and Cetin-Atalay, Rengul and Martin, Maria and Atalay, Volkan},\ntitle = \"{CROssBAR: comprehensive resource of biomedical relations with knowledge graph representations}\",\njournal = {Nucleic Acids Research},\nvolume = {49},\nnumber = {16},\npages = {e96-e96},\nyear = {2021},\nmonth = {06},\nissn = {0305-1048},\ndoi = {10.1093/nar/gkab543},\nurl = {https://doi.org/10.1093/nar/gkab543},\n}"
57 | templates:
58 |   - The {node1_type#} {SMILES#} {rel1_type#} {node2_name#}.
--------------------------------------------------------------------------------
/data/natural/preprocess_msds.py:
--------------------------------------------------------------------------------
1 | """This script loads MSDS data parsed from Sigma Aldrich
2 | (https://huggingface.co/datasets/chemNLP/MSDS/tree/main) and flattens it.
3 |
4 | You need to change filepaths before running this script
5 | """
6 |
7 | import json
8 | import os
9 |
10 |
11 | def get_text(d, text="", level=1, linebreaks=2):
12 |     for k in d:
13 |         if k in [
14 |             "SECTION 6: Accidental release measures",  # always empty
15 |             "SECTION 1: Toxicological information",  # always empty
16 |             "SECTION 16: Other information",  # always the same information
17 |         ]:
18 |             continue
19 |
20 |         text += "#" * level + " " + k + "\n" * linebreaks
21 |
22 |         if isinstance(d[k], str):
23 |             if d[k] != "":
24 |                 text += d[k].rstrip() + "\n" * linebreaks
25 |         elif isinstance(d[k], dict):
26 |             text = get_text(d[k], text=text, level=level + 1)
27 |     return text
28 |
29 |
30 | if __name__ == "__main__":
31 |     path_jsonl_in = "/fsx/proj-chemnlp/micpie/chemnlp/data/natural/msds/msds.jsonl"
32 |
33 |     # load
34 |     with open(path_jsonl_in) as f:
35 |         data = [json.loads(line) for line in f]
36 |
37 |     # process
38 |     data = list(map(get_text, data))
39 |     data = [{"text": x} for x in data]
40 |
41 |     # save
42 |     path_jsonl_out = path_jsonl_in.replace(".jsonl", "_clean.jsonl")
43 |     if os.path.isfile(path_jsonl_out):
44 |         print(f"Output file already exists, please check: {path_jsonl_out}")
45 |     else:
46 |         with open(path_jsonl_out, "a") as fout:
47 |             for sample in data:
48 |                 fout.write(json.dumps(sample) + "\n")
49 |         print(f"JSONL saved to: {path_jsonl_out}")
50 |
--------------------------------------------------------------------------------
/data/natural/preprocess_nougat.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=preprocess_nougat
3 | #SBATCH --output=/fsx/proj-chemnlp/micpie/chemnlp/data/natural/%x_%j.out
4 | #SBATCH --account chemnlp
5 | #SBATCH --comment chemnlp
6 | #SBATCH --partition=cpu16
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --cpus-per-task=1
9 |
10 | cd /fsx/proj-chemnlp/micpie/chemnlp/data/natural/
11 |
12 | ## ensure we can use activate syntax in slurm scripts
13 | export CONDA_ENV_PATH=/admin/home-micpie/miniconda3/envs/chemnlp
14 | CONDA_BASE=$(conda info --base)
15 | source $CONDA_BASE/etc/profile.d/conda.sh
16 | conda activate ${CONDA_ENV_PATH}
17 |
18 | python --version
19 |
20 | python preprocess_nougat.py
21 |
22 | #DATE=$(date -d "today" +"%Y%m%d%H%M")
23 | #echo $DATE
24 |
25 | #mv /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv.jsonl /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv_$DATE.jsonl
26 | #tar -cvzf /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv_$DATE.jsonl.tar.gz /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv_$DATE.jsonl
27 |
28 | #mv /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv.jsonl /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv_$DATE.jsonl
29 | #tar -cvzf /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv_$DATE.jsonl.tar.gz /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv_$DATE.jsonl
30 |
31 | #mv /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv.jsonl /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv_$DATE.jsonl
32 | #tar -cvzf /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv_$DATE.jsonl.tar.gz /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv_$DATE.jsonl
33 |
--------------------------------------------------------------------------------
/data/tabular/BACE/transform.py:
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_dataset() -> pd.DataFrame: 5 | bace = pd.read_csv( 6 | "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv" 7 | ) 8 | return bace 9 | 10 | 11 | def transform_data(): 12 | bace = load_dataset() 13 | bace = bace.rename(columns={"mol": "SMILES", "Class": "BACE_inhibition"}) 14 | bace = bace.drop(columns=["CID", "Model", "canvasUID"]) 15 | 16 | # Keeping only qualitative and quantitative pIC50 values 17 | # Removing all the RDKit computed descriptors 18 | cols_to_keep = ["SMILES", "pIC50", "BACE_inhibition"] 19 | bace = bace[cols_to_keep] 20 | bace.to_csv("data_clean.csv", index=False) 21 | 22 | 23 | if __name__ == "__main__": 24 | transform_data() 25 | -------------------------------------------------------------------------------- /data/tabular/BBBP/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_dataset(): 5 | BBBP = pd.read_csv( 6 | "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv" 7 | ) 8 | return BBBP 9 | 10 | 11 | def transform_data(): 12 | BBBP = load_dataset() 13 | 14 | cols_to_keep = [ 15 | "smiles", 16 | "p_np", 17 | ] 18 | 19 | BBBP = BBBP[cols_to_keep] 20 | BBBP = BBBP.rename(columns={"smiles": "SMILES"}) 21 | BBBP.to_csv("data_clean.csv", index=False) 22 | 23 | 24 | if __name__ == "__main__": 25 | transform_data() 26 | -------------------------------------------------------------------------------- /data/tabular/MUV_466/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_466 2 | description: Activity in the MUV_466 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-466 9 | type: boolean 10 | description: MUV-466 11 | names: 12 | - noun: an agonist of the S1P1 receptor 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14841 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-466#not &NULL}{MUV-466__names__noun}. 
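# --- Editor's note (not part of the original meta.yaml): a hedged reading of the template ---
# --- placeholder syntax, inferred from how it is used across the meta.yaml files above.    ---
# {#molecule|compound|...!}   appears to pick one of the pipe-separated alternatives at random.
# {SMILES#}                   appears to insert the row's value from the SMILES column.
# {SMILES__description}       appears to insert the `description` field of the SMILES identifier.
# {MUV-466__names__noun}      appears to insert one of the `noun` entries defined under `names`.
# {MUV-466#not &NULL}         appears to map the boolean label to text, e.g. 0 -> "not " and 1 -> "" (empty).
# Under that reading, an inactive (label 0) row might render roughly as:
#   "The compound with the SMILES representation of <SMILES> is not an agonist of the S1P1 receptor."
# This is an illustrative sketch only; see src/chemnlp/data/sampler.py and docs/api/sampler.md in this
# repository for the actual sampling implementation.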
34 | -------------------------------------------------------------------------------- /data/tabular/MUV_466/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_466/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_548/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_548 2 | description: Activity in the MUV_548 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-548 9 | type: boolean 10 | description: MUV-548 11 | names: 12 | - noun: an inhibitor of the protein kinase A (PKA) 13 | - noun: an inhibitor of the protein kinase A 14 | - noun: an inhibitor of PKA 15 | license: CC BY 4.0 16 | links: 17 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 18 | description: corresponding publication 19 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 20 | description: Data source 21 | num_points: 14734 22 | bibtex: 23 | - | 24 | @article{doi:10.1021/ci8002649, 25 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 26 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 27 | journal = {Journal of Chemical Information and Modeling}, 28 | volume = {49}, 29 | number = {2}, 30 | pages = {169-184}, 31 | year = {2009}, 32 | doi = {10.1021/ci8002649}, 33 | URL = {https://doi.org/10.1021/ci8002649}} 34 | templates: 35 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-548#not &NULL}{MUV-548__names__noun}. 
36 | -------------------------------------------------------------------------------- /data/tabular/MUV_548/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_548/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_600/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_600 2 | description: Activity in the MUV_600 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-600 9 | type: boolean 10 | description: MUV-600 11 | names: 12 | - noun: an inhibitor of the steroidogenic factor 1 (SF-1) 13 | - noun: an inhibitor of SF-1 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14728 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-600#not &NULL}{MUV-600__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_600/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_600/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_644/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_644 2 | description: Activity in the MUV_644 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-644 9 | type: boolean 10 | description: MUV-644 11 | names: 12 | - noun: an inhibitor of Rho-kinase 2 (ROCK-2) 13 | - noun: an inhibitor of ROCK-2 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14623 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-644#not &NULL}{MUV-644__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_644/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_644/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_652/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_652 2 | description: Activity in the MUV_652 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-652 9 | type: boolean 10 | description: MUV-652 11 | names: 12 | - noun: an inhibitor of HIV RT-RNase 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14902 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-652#not &NULL}{MUV-652__names__noun}. 
34 | -------------------------------------------------------------------------------- /data/tabular/MUV_652/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_652/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_689/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_689 2 | description: Activity in the MUV_689 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-689 9 | type: boolean 10 | description: MUV-689 11 | names: 12 | - noun: an inhibitor of the EPH receptor A4 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14601 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-689#not &NULL}{MUV-689__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_689/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_689/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_692/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_692 2 | description: Activity in the MUV_692 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-692 9 | type: boolean 10 | description: MUV-692 11 | names: 12 | - noun: an agonist of the steroidogenic factor 1 (SF-1) 13 | - noun: an agonist of SF-1 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14644 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-692#not &NULL}{MUV-692__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_692/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_692/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_712/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_712 2 | description: Activity in the MUV_712 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-712 9 | type: boolean 10 | description: MUV-712 11 | names: 12 | - noun: an inhibitor of the heat shock protein 90 13 | - noun: an inhibitor of HSP90 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14411 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-712#not &NULL}{MUV-712__names__noun}. 
35 | -------------------------------------------------------------------------------- /data/tabular/MUV_712/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_712/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_713/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_713 2 | description: Activity in the MUV_713 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-713 9 | type: boolean 10 | description: MUV-713 11 | names: 12 | - noun: an inhibitor of the estrogen receptor-alpha-coactivator binding 13 | - noun: an inhibitor of the ER-alpha-coact. binding 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14836 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-713#not &NULL}{MUV-713__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_713/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_713/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_733/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_733 2 | description: Activity in the MUV_733 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-733 9 | type: boolean 10 | description: MUV-733 11 | names: 12 | - noun: an inhibitor of the estrogen receptor-alpha-coactivator binding 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14682 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-733#not &NULL}{MUV-733__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_733/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_733/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_737/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_737 2 | description: Activity in the MUV_737 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-737 9 | type: boolean 10 | description: MUV-737 11 | names: 12 | - noun: a potentiator of the estrogen receptor-alpha-coactivator binding 13 | - noun: a potentiator of the ER-alpha-coact. binding 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14691 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-737#not &NULL}{MUV-737__names__noun}. 
35 | -------------------------------------------------------------------------------- /data/tabular/MUV_737/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_737/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_810/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_810 2 | description: Activity in the MUV_810 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-810 9 | type: boolean 10 | description: MUV-810 11 | names: 12 | - noun: an inhibitor of the focal adhesion kinase 13 | - noun: an inhibitor of FAK 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14644 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-810#not &NULL}{MUV-810__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_810/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_810/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_832/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_832 2 | description: Activity in the MUV_832 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-832 9 | type: boolean 10 | description: MUV-832 11 | names: 12 | - noun: an inhibitor of the Cathepsin G protease 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14667 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-832#not &NULL}{MUV-832__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_832/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_832/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_846/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_846 2 | description: Activity in the MUV_846 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-846 9 | type: boolean 10 | description: MUV-846 11 | names: 12 | - noun: an inhibitor of factor XIa (FXIa) 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14711 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-846#not &NULL}{MUV-846__names__noun}. 34 | - |- 35 | Question: Is the {SMILES__description} {SMILES#} {MUV-846__names__noun}? 36 | 37 | Answer:{MUV-846#no&yes} 38 | - |- 39 | Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-846__names__noun}. 40 | 41 | Result:{MUV-846#no&yes} 42 | - |- 43 | Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-846#not &NULL}{MUV-846__names__noun} and report its {SMILES__description}. 44 | 45 | Result:{SMILES#} 46 | - |- 47 | {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-846#not &NULL}{MUV-846__names__noun} and report its {SMILES__description}. 
48 | 49 | Result: {SMILES#} 50 | -------------------------------------------------------------------------------- /data/tabular/MUV_846/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_846/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_852/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_852 2 | description: Activity in the MUV_852 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-852 9 | type: boolean 10 | description: MUV-852 11 | names: 12 | - noun: an inhibitor of factor XIIa (FXIIa) 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14651 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-852#not &NULL}{MUV-852__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_852/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_852/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_858/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_858 2 | description: Activity in the MUV_858 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-858 9 | type: boolean 10 | description: MUV-858 11 | names: 12 | - noun: an allosteric modulator of the dopamine receptor D1 13 | - noun: an allosteric modulator of the D1 receptor 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14774 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-858#not &NULL}{MUV-858__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_858/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_858/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_859/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_859 2 | description: Activity in the MUV_859 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-859 9 | type: boolean 10 | description: MUV-859 11 | names: 12 | - noun: an allosteric inhibitor of the muscarinic acetylcholine receptor M1 13 | - noun: an allosteric inhibitor of the M1 receptor 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14746 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-859#not &NULL}{MUV-859__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_859/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_859/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/aminoacids/meta.yaml: -------------------------------------------------------------------------------- 1 | name: aminoacids 2 | description: |- 3 | The list of the 20 essential aminoacids, their SMILES, one letter and three letter codes. 
4 | targets: 5 | - id: three_letter_code 6 | description: three-letter code 7 | type: text 8 | - id: one_letter_code 9 | description: one-letter code 10 | type: text 11 | - id: aminoacid_name 12 | description: name 13 | type: text 14 | - id: type 15 | description: type of aminoacid 16 | type: text 17 | identifiers: 18 | - id: SMILES 19 | type: SMILES 20 | description: SMILES 21 | license: CC BY 4.0 22 | links: 23 | - url: https://chemistry.stackexchange.com/questions/138614/why-are-tyrosine-and-tryptophan-considered-hydrophobic 24 | description: reference for amino acid type 25 | num_points: 20 26 | templates: 27 | - The {#essential amino acid|amino acid|amino acid (AA)|AA!} with the {SMILES__description} {SMILES#} has the one-letter code {one_letter_code#} and the three-letter code {three_letter_code#}. 28 | - The {#essential amino acid|amino acid|amino acid (AA)|AA!} {aminoacid_name#} has the one-letter code {one_letter_code#} and the three-letter code {three_letter_code#}. 29 | - |- 30 | Question: What is the one-letter code of the {#essential amino acid|amino acid|amino acid (AA)|AA!} with the {SMILES__description} {SMILES#}? 31 | Answer: {one_letter_code#}. 32 | - |- 33 | Question: What is the three-letter code of the {#essential amino acid|amino acid|amino acid (AA)|AA!} with the {SMILES__description} {SMILES#}? 34 | Answer: {three_letter_code#}. 35 | - |- 36 | Question: What is the type of the amino acid with the one-letter code {one_letter_code#} and {SMILES__description} {SMILES#}? 37 | Constraint: The possible types are: polar, non-polar, positively charged, negatively charged. 38 | Answer: From the provided amino acid types (polar, non-polar, positively charged, negatively charged), the amino acid with the one-letter code {one_letter_code#} is {type#}. 39 | -------------------------------------------------------------------------------- /data/tabular/aminoacids/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def extract_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/uniprot", filename="aminoacid_seq.csv", repo_type="dataset" 8 | ) 9 | aminoacids = pd.read_csv(file) 10 | aminoacids.to_csv("data_clean.csv", index=False) 11 | return aminoacids 12 | 13 | 14 | if __name__ == "__main__": 15 | extract_data() 16 | -------------------------------------------------------------------------------- /data/tabular/bc5chem/meta.yaml: -------------------------------------------------------------------------------- 1 | name: bc5chem 2 | description: |- 3 | BC5CHEM is a named entity recognition dataset for chemical mentions. 
4 | targets: 5 | - id: matched_words 6 | description: matched words 7 | type: text 8 | names: 9 | - noun: entity 10 | - noun: matched entity 11 | identifiers: 12 | - id: sentence 13 | description: Sentence 14 | type: text 15 | names: 16 | - noun: sentence 17 | - noun: text 18 | license: https://huggingface.co/datasets/bigbio/blurb/blob/main/LICENSE 19 | links: 20 | - url: https://huggingface.co/datasets/bigbio/blurb 21 | description: original dataset 22 | benchmarks: 23 | - name: bc5chem 24 | link: https://huggingface.co/datasets/bigbio/blurb 25 | split_column: split 26 | num_points: 13755 27 | bibtex: 28 | - |- 29 | @article{gu2021domain, 30 | title = { 31 | Domain-specific language model pretraining for biomedical natural 32 | language processing 33 | }, 34 | author = { 35 | Gu, Yu and Tinn, Robert and Cheng, Hao and Lucas, Michael and 36 | Usuyama, Naoto and Liu, Xiaodong and Naumann, Tristan and Gao, 37 | Jianfeng and Poon, Hoifung 38 | }, 39 | year = 2021, 40 | journal = {ACM Transactions on Computing for Healthcare (HEALTH)}, 41 | publisher = {ACM New York, NY}, 42 | volume = 3, 43 | number = 1, 44 | pages = {1--23} 45 | } 46 | templates: 47 | - |- 48 | Task: Find all the mentions of {#chemicals|chemical compounds|chemical substances!} in the {#following|subsequent!} {#text|sentence!}. Return the matching {#words|entities!}. If there is no {#match|mention of a chemical|matching entity!}, return `no match`. 49 | {#Sentence|Description!}: {sentence#} 50 | Answer: {matched_words#} 51 | - |- 52 | User: Does the following text contain mentions of {#chemicals|chemical compounds|chemical substances!}?{# Can you return matches?| Can you output matches?| Please return matches.!} 53 | {#Text: |!}{sentence#} 54 | Assistant: {#I found|There is!} {matched_words#}.
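Note on the prompt-template syntax used in these meta.yaml files: `{#a|b|c!}` appears to mark interchangeable phrasings, `{column#}` is filled from the corresponding column of data_clean.csv, `{column__names__noun}` seems to draw from the noun list declared for that column, and boolean forms such as `{MUV-846#no&yes}` appear to pick one of the two texts around the `&` based on the target value. The snippet below is a minimal, hypothetical sketch of the two simplest cases (random choice and column substitution) under those assumed semantics; it is not the project's actual sampling code, and the `expand` helper and the example row are invented for illustration.

import random
import re


def expand(template: str, row: dict) -> str:
    # Assumed semantics, for illustration only:
    # "{#opt1|opt2!}" -> pick one of the listed options at random
    template = re.sub(r"\{#(.*?)!\}", lambda m: random.choice(m.group(1).split("|")), template)
    # "{column#}" -> substitute the row's value for that column
    return re.sub(r"\{([\w-]+)#\}", lambda m: str(row[m.group(1)]), template)


example_row = {"sentence": "Aspirin was detected in the sample.", "matched_words": "Aspirin"}
example_template = "{#Sentence|Description!}: {sentence#}\nAnswer: {matched_words#}"
print(expand(example_template, example_row))

Running this prints one randomly phrased prompt/completion pair for the example row; the mapped boolean forms and `__names__` lookups would need additional handling that is omitted here.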
55 | -------------------------------------------------------------------------------- /data/tabular/bc5chem/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | 4 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner 5 | from chemnlp.data.utils import oxford_comma_join 6 | 7 | 8 | def process(): 9 | # tokenized at whitespaces and punctuations 10 | dataset = load_dataset("bigbio/blurb", "bc5chem") 11 | dfs = [] 12 | for split in ["train", "validation", "test"]: 13 | df_ = dataset[split].to_pandas() 14 | df_["split"] = split 15 | dfs.append(df_) 16 | df = pd.concat(dfs) 17 | ner_labels = df["ner_tags"] 18 | 19 | matched_words = [] 20 | for tokens, ner_label in zip(df["tokens"], ner_labels): 21 | words = group_tokens_by_labels(tokens, ner_label) 22 | if len(words) == 0: 23 | matched_words.append("no match") 24 | else: 25 | matched_words.append(oxford_comma_join(words)) 26 | 27 | df["matched_words"] = matched_words 28 | df["sentence"] = df["tokens"].apply(punctuation_joiner) 29 | 30 | df = df[["sentence", "matched_words"]] 31 | 32 | # ensure we have at least 5 words in a sentence 33 | df = df[df["sentence"].apply(lambda x: len(x.split()) >= 5)] 34 | 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/bc5disease/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | 4 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner 5 | from chemnlp.data.utils import oxford_comma_join 6 | 7 | 8 | def process(): 9 | # tokenized at whitespaces and punctuations 10 | dataset = load_dataset("bigbio/blurb", "bc5disease") 11 | dfs = [] 12 | for split in ["train", "validation", "test"]: 13 | df_ = dataset[split].to_pandas() 14 | df_["split"] = split 15 | dfs.append(df_) 16 | df = pd.concat(dfs) 17 | ner_labels = df["ner_tags"] 18 | 19 | matched_words = [] 20 | for tokens, ner_label in zip(df["tokens"], ner_labels): 21 | words = group_tokens_by_labels(tokens, ner_label) 22 | if len(words) == 0: 23 | matched_words.append("no match") 24 | else: 25 | matched_words.append(oxford_comma_join(words)) 26 | 27 | df["matched_words"] = matched_words 28 | df["sentence"] = df["tokens"].apply(punctuation_joiner) 29 | 30 | df = df[["sentence", "matched_words"]] 31 | 32 | # ensure we have at least 5 words in a sentence 33 | df = df[df["sentence"].apply(lambda x: len(x.split()) >= 5)] 34 | 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/bicerano_dataset/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from canonicalize_psmiles.canonicalize import canonicalize 3 | from huggingface_hub import hf_hub_download 4 | 5 | 6 | def transform_data(): 7 | file = hf_hub_download( 8 | repo_id="chemNLP/bicerano_polymers", 9 | filename="HT_MD_polymer_properties.csv", 10 | repo_type="dataset", 11 | ) 12 | original_data = pd.read_csv(file) 13 | clean_data = original_data.drop("sl_num", axis=1) 14 | 15 | assert not clean_data.duplicated().sum() 16 | 17 | clean_columns = [ 18 | 
"compound_name", 19 | "PSMILES", 20 | "Tg_exp", 21 | "Tg_calc", 22 | "Tg_calc_std", 23 | "rho_300K_exp", 24 | "rho_300K_calc", 25 | "rho_300K_calc_std", 26 | "glass_CTE_calc", 27 | "glass_CTE_calc_std", 28 | "rubber_CTE_calc", 29 | "rubber_CTE_calc_std", 30 | ] 31 | 32 | clean_data.columns = clean_columns 33 | 34 | clean_data["compound_name"] = clean_data["compound_name"].str.strip() 35 | 36 | clean_data["PSMILES"] = clean_data["PSMILES"].str.replace( 37 | "[Ce]", "[*]", regex=False 38 | ) 39 | clean_data["PSMILES"] = clean_data["PSMILES"].str.replace( 40 | "[Th]", "[*]", regex=False 41 | ) 42 | clean_data["PSMILES"] = clean_data["PSMILES"].str.replace( 43 | "[[*]]", "[*]", regex=False 44 | ) 45 | 46 | clean_data["PSMILES"] = clean_data["PSMILES"].apply( 47 | lambda smiles: canonicalize(smiles) 48 | ) 49 | 50 | clean_data.to_csv("data_clean.csv") 51 | 52 | 53 | if __name__ == "__main__": 54 | transform_data() 55 | -------------------------------------------------------------------------------- /data/tabular/bio_ner/meta.yaml: -------------------------------------------------------------------------------- 1 | name: bio_ner 2 | description: NER task on bio-related text. 3 | identifiers: 4 | - id: Sentence 5 | description: Sentence 6 | type: Other 7 | targets: 8 | - id: entity_1 9 | description: entity_1 10 | type: Other 11 | units: entity_1 12 | names: 13 | - noun: entity_1 14 | - id: json 15 | description: json 16 | type: Other 17 | units: 18 | names: 19 | - noun: JSON output 20 | benchmarks: 21 | - name: bio_ner 22 | link: https://github.com/ML4LitS/bio-datasets 23 | split_column: split 24 | license: unknown 25 | links: 26 | - url: https://github.com/ML4LitS/bio-datasets 27 | description: ??? 28 | num_points: 123509 29 | bibtex: 30 | - ??? 31 | templates: 32 | - |- 33 | Task: Please carry out the {#named entity recognition (NER)|named entity recognition|NER!} task for the the text below. 34 | Text: {Sentence#}. 35 | Constrain: Please, {#only |!}list the entities in the form NER entity, span start, span end, and type {#in separate lines |!}with a high probability of being in the text. 
36 | Result: {entity_1#} 37 | -------------------------------------------------------------------------------- /data/tabular/block_polymers_morphology/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | columns_to_keep = ["phase1", "T", "BigSMILES", "Mn", "f1", "Mw", "D"] 4 | 5 | 6 | def process(): 7 | df = pd.read_csv( 8 | "https://raw.githubusercontent.com/olsenlabmit/BCDB/main/data/diblock.csv" 9 | ) 10 | df = df[df["phase2"].isna()] # remove multiple phases 11 | mw_clean = [] 12 | dispersity_clean = [] 13 | 14 | for mw, dispersity in zip(df["Mw"], df["D"]): 15 | # if nan, make empty string 16 | # else, add the units 17 | if pd.isna(mw) or "nan" in str(mw): 18 | mw_clean.append("REPLACENULL") 19 | else: 20 | mw_clean.append(f", average molecular mass of {mw:.1f} g/mol") 21 | 22 | if pd.isna(dispersity) or "nan" in str(dispersity): 23 | # empty character that will still appear in the csv 24 | dispersity_clean.append("REPLACENULL") 25 | else: 26 | dispersity_clean.append(f", and dispersity of {dispersity:.1f}") 27 | 28 | df["Mw"] = mw_clean 29 | df["D"] = dispersity_clean 30 | df.dropna(subset=columns_to_keep, inplace=True) 31 | print(len(df)) 32 | df[columns_to_keep].to_csv("data_clean.csv", index=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | process() 37 | -------------------------------------------------------------------------------- /data/tabular/caco2_wang/meta.yaml: -------------------------------------------------------------------------------- 1 | name: caco2_wang 2 | description: |- 3 | The human colon epithelial cancer cell line, Caco-2, 4 | is used as an in vitro model to simulate the human intestinal tissue. 5 | The experimental result on the rate of drug passing through 6 | the Caco-2 cells can approximate the rate at which the drug permeates 7 | through the human intestinal tissue. 8 | targets: 9 | - id: permeability 10 | description: Caco-2 cell effective permeability. 
11 | units: cm/s 12 | type: continuous 13 | names: 14 | - noun: Caco-2 cell effective permeability 15 | - noun: Caco-2 cell permeability 16 | - noun: Caco-2 permeability 17 | pubchem_aids: 18 | - 678378 19 | uris: 20 | - http://www.bioassayontology.org/bao#BAO_0010008 21 | - http://purl.obolibrary.org/obo/MI_2162 22 | benchmarks: 23 | - name: TDC 24 | link: https://tdcommons.ai/ 25 | split_column: split 26 | identifiers: 27 | - id: SMILES 28 | type: SMILES 29 | description: SMILES 30 | - id: compound_name 31 | type: Other 32 | description: compound name 33 | names: 34 | - noun: compound 35 | - noun: compound name 36 | license: CC BY 4.0 37 | links: 38 | - url: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al 39 | description: original data set link 40 | - url: https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642 41 | description: corresponding publication 42 | num_points: 910 43 | bibtex: 44 | - |- 45 | @article{wang2016adme, 46 | title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability 47 | using a combination of NSGA-II and boosting}, 48 | author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, 49 | Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng}, 50 | journal={Journal of Chemical Information and Modeling}, 51 | volume={56}, 52 | number={4}, 53 | pages={763--773}, 54 | year={2016}, 55 | publisher={ACS Publications} 56 | } 57 | -------------------------------------------------------------------------------- /data/tabular/chem_caption_smarts/meta.yaml: -------------------------------------------------------------------------------- 1 | name: chem_caption_smarts 2 | description: |- 3 | This dataset contains the count of substructures in molecules 4 | targets: 5 | - id: smarts 6 | type: text 7 | description: substructure smarts 8 | names: 9 | - noun: SMARTS 10 | - noun: SMiles ARbitrary Target Specification (SMARTS) 11 | - id: completion 12 | type: categorical 13 | description: number of matches 14 | - id: completion_labels 15 | type: text 16 | description: name of the substructure 17 | identifiers: 18 | - id: representation 19 | type: text 20 | description: representation 21 | - id: representation_type 22 | type: text 23 | description: representation type 24 | license: CC BY 4.0 25 | links: 26 | - url: https://github.com/lamalab-org/chem-caption 27 | description: Original codebase used to generate this dataset 28 | templates: 29 | - |- 30 | Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain the substructure with the {smarts__names__noun} {#smarts#}? 31 | Answer: {completion#} 32 | - |- 33 | Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain a {completion#} substructure? 34 | Answer: {smarts__names__noun} {#smarts#} 35 | - |- 36 | User: {#I want to|I have to|I must|I would like to!} know {#how many times|how often!} the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#}. 37 | Assistant: The {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. 
38 | - |- 39 | User: {#I want to|I have to|I must|I would like to!} know how many times the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains a {completion#} substructure. 40 | Assistant: The {#molecule|chemical|compound|chemical structure!} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. 41 | -------------------------------------------------------------------------------- /data/tabular/chem_caption_smarts/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | # get the smarts config 6 | df = pd.read_parquet( 7 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/smarts/train-00000-of-00001-71cef18c6383b463.parquet" # noqa 8 | ) 9 | df["completion_labels"] = df["completion_labels"].astype(str) 10 | df["completion_labels"] = df["completion_labels"].str.replace( 11 | "_count", "", regex=True 12 | ) 13 | df.to_csv("data_clean.csv", index=False) 14 | 15 | 16 | if __name__ == "__main__": 17 | process() 18 | -------------------------------------------------------------------------------- /data/tabular/chembl_v29/meta.yaml: -------------------------------------------------------------------------------- 1 | name: chembl_v29 2 | description: |- 3 | ChEMBL is a manually curated database of bioactive molecules with drug-like properties. 4 | It brings together chemical, bioactivity and genomic data 5 | to aid the translation of genomic information into effective new drugs. 6 | benchmarks: 7 | - name: TDC 8 | link: https://tdcommons.ai/ 9 | split_column: split 10 | identifiers: 11 | - id: SMILES 12 | type: SMILES 13 | description: SMILES 14 | license: CC BY-SA 3.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/47/D1/D930/5162468 17 | description: Article about original dataset 18 | - url: https://academic.oup.com/nar/article/43/W1/W612/2467881 19 | description: Exemplary related article shown in tdc's website 20 | num_points: 2084637 21 | bibtex: 22 | - |- 23 | @article{10.1093/nar/gky1075, 24 | author = {Mendez, David and Gaulton, Anna and Bento, A Patricia and Chambers, Jon and De Veij, 25 | Marleen and Felix, Eloy and Magarinos, Maria Paula and Mosquera, 26 | Juan F and Mutowo, Prudence and Nowotka, Michal and Gordillo-Maranon, 27 | Maria and Hunter, Fiona and Junco, Laura and Mugumbate, Grace and Rodriguez-Lopez, Milagros and Atkinson, 28 | Francis and Bosc, Nicolas and Radoux, Chris J and Segura-Cabrera, Aldo and Hersey, Anne and Leach, Andrew R}, 29 | title = {ChEMBL: towards direct deposition of bioassay data}, 30 | journal = {Nucleic Acids Research}, 31 | volume = {47}, 32 | number = {D1}, 33 | pages = {D930-D940}, 34 | year = {2018}, 35 | month = {11}, 36 | abstract = "{ChEMBL is a large, open-access bioactivity database 37 | (https://www.ebi.ac.uk/chembl), previously described in the 2012, 38 | 2014 and 2017 Nucleic Acids Research Database Issues. 39 | In the last two years, several important improvements have been made to the database and are described here. 
40 | These include more robust capture and representation of assay details; 41 | a new data deposition system, allowing updating of data sets and deposition of supplementary data; 42 | and a completely redesigned web interface, with enhanced search and filtering capabilities.}", 43 | issn = {0305-1048}, 44 | doi = {10.1093/nar/gky1075}, 45 | url = {https://doi.org/10.1093/nar/gky1075}, 46 | eprint = {https://academic.oup.com/nar/article-pdf/47/D1/D930/27437436/gky1075.pdf}, 47 | } 48 | -------------------------------------------------------------------------------- /data/tabular/chemcaption_fragments/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_parquet( 6 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/smarts/train-00000-of-00001-71cef18c6383b463.parquet?download=true" # noqa 7 | ) 8 | df.dropna(inplace=True) 9 | print(len(df)) 10 | df["fragment"] = df["completion_labels"].str.replace("_count", "") 11 | df["presence"] = df["completion"] > 0 12 | df["molecule"] = df["representation"] 13 | df.to_csv("data_clean.csv", index=False) 14 | 15 | 16 | if __name__ == "__main__": 17 | process() 18 | -------------------------------------------------------------------------------- /data/tabular/chemcaption_rdkit/transform.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import pandas as pd 3 | 4 | 5 | def process(): 6 | df = pd.read_parquet( 7 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/rdkit_feat/train-00000-of-00001-7cea16ab26bf74cf.parquet?download=true" # noqa 8 | ) 9 | df["num_bonds_simple"] = df[ 10 | [ 11 | "num_single_bonds", 12 | "num_double_bonds", 13 | "num_triple_bonds", 14 | "num_quadruple_bonds", 15 | "num_quintuple_bonds", 16 | "num_aromatic_bonds", 17 | ] 18 | ].sum(axis=1) 19 | 20 | df = df[df["num_bonds_simple"].astype(int) == df["num_bonds"].astype(int)] 21 | 22 | df[ 23 | [ 24 | "num_valence_electrons", 25 | "num_single_bonds", 26 | "num_double_bonds", 27 | "num_triple_bonds", 28 | "num_quadruple_bonds", 29 | "num_quintuple_bonds", 30 | "num_aromatic_bonds", 31 | "num_bonds", 32 | "num_carbon_atoms", 33 | "num_hydrogen_atoms", 34 | "num_nitrogen_atoms", 35 | "num_oxygen_atoms", 36 | "num_hydrogen_bond_acceptors", 37 | "num_hydrogen_bond_donors", 38 | "num_lipinski_violations", 39 | "num_chiral_centers", 40 | ] 41 | ] = df[ 42 | [ 43 | "num_valence_electrons", 44 | "num_single_bonds", 45 | "num_double_bonds", 46 | "num_triple_bonds", 47 | "num_quadruple_bonds", 48 | "num_quintuple_bonds", 49 | "num_aromatic_bonds", 50 | "num_bonds", 51 | "num_carbon_atoms", 52 | "num_hydrogen_atoms", 53 | "num_nitrogen_atoms", 54 | "num_oxygen_atoms", 55 | "num_hydrogen_bond_acceptors", 56 | "num_hydrogen_bond_donors", 57 | "num_lipinski_violations", 58 | "num_chiral_centers", 59 | ] 60 | ].astype( 61 | int 62 | ) 63 | print(len(df)) 64 | df.to_csv("data_clean.csv", index=False) 65 | 66 | 67 | if __name__ == "__main__": 68 | fire.Fire(process) 69 | -------------------------------------------------------------------------------- /data/tabular/chemdner/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | from chemnlp.data.utils import oxford_comma_join 4 | 5 | 6 | def process(): 7 | dataset = load_dataset("kjappelbaum/chemnlp-chemdner") 8 | df = dataset["train"].to_pandas() 9 
| 10 | matched_words = [] 11 | for ent in df["entities"]: 12 | if len(ent) == 0: 13 | matched_words.append("no match") 14 | else: 15 | matched_words.append(oxford_comma_join(ent)) 16 | 17 | df["matched_words"] = matched_words 18 | df["sentence"] = df["text"] 19 | 20 | print(len(df)) 21 | 22 | df = df[["sentence", "matched_words"]] 23 | df.to_csv("data_clean.csv", index=False) 24 | 25 | 26 | if __name__ == "__main__": 27 | process() 28 | -------------------------------------------------------------------------------- /data/tabular/chemistry_stackexchange/meta.yaml: -------------------------------------------------------------------------------- 1 | name: chemistry_stackexchange 2 | description: |- 3 | Questions and answers mined from chemistry.stackexchange.com. 4 | targets: 5 | - id: a 6 | description: answer to the question 7 | type: string 8 | - id: title 9 | description: title of the question 10 | type: string 11 | identifiers: 12 | - id: q 13 | type: string 14 | description: question asked on chemistry.stackexchange.com 15 | license: CC BY-SA 16 | links: 17 | - url: chemistry.stackexchange.com 18 | description: original data source 19 | - url: https://stackoverflow.com/help/licensing 20 | description: information about the license 21 | num_points: 4582 22 | templates: 23 | - |- 24 | {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} 25 | {#User: |Question: |Inquiry: |\n!}{#q} 26 | {#Assistant: |Answer: !}{#a} 27 | - |- 28 | {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} 29 | {#Question: |Inquiry: |\n!}{#q} 30 | {#Assistant: |Title: |Answer: |!}{#title} 31 | -------------------------------------------------------------------------------- /data/tabular/core_mof_no_topo/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-core-mof", 8 | filename="core_mofid.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df = df.query("is_longer_than_allowed==False").dropna( 13 | subset=[ 14 | "outputs.pure_CO2_kH", 15 | "outputs.pure_CO2_widomHOA", 16 | "outputs.pure_methane_kH", 17 | "outputs.pure_methane_widomHOA", 18 | "outputs.pure_uptake_CO2_298.00_15000", 19 | "outputs.pure_uptake_CO2_298.00_1600000", 20 | "outputs.pure_uptake_methane_298.00_580000", 21 | "outputs.pure_uptake_methane_298.00_6500000", 22 | "outputs.logKH_CO2", 23 | "outputs.logKH_CH4", 24 | "outputs.CH4DC", 25 | "outputs.CH4HPSTP", 26 | "outputs.CH4LPSTP", 27 | "smiles_linkers", 28 | "smiles_nodes", 29 | ] 30 | ) 31 | 32 | print(len(df)) 33 | 34 | df["smiles_linkers"] = df["smiles_linkers"].apply(lambda x: ", ".join(x)) 35 | df["smiles_nodes"] = df["smiles_nodes"].apply(lambda x: ", ".join(x)) 36 | 37 | df[ 38 | [ 39 | "outputs.pure_CO2_kH", 40 | "outputs.pure_CO2_widomHOA", 41 | "outputs.pure_methane_kH", 42 | "outputs.pure_methane_widomHOA", 43 | "outputs.pure_uptake_CO2_298.00_15000", 44 | "outputs.pure_uptake_CO2_298.00_1600000", 45 | "outputs.pure_uptake_methane_298.00_580000", 46 | 
"outputs.pure_uptake_methane_298.00_6500000", 47 | "outputs.logKH_CO2", 48 | "outputs.logKH_CH4", 49 | "outputs.CH4DC", 50 | "outputs.CH4HPSTP", 51 | "outputs.CH4LPSTP", 52 | "smiles_linkers", 53 | "smiles_nodes", 54 | "cif", 55 | ] 56 | ].to_csv("data_clean.csv", index=False) 57 | 58 | 59 | if __name__ == "__main__": 60 | process() 61 | -------------------------------------------------------------------------------- /data/tabular/drugchat_liang_zhang_et_al/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import concatenate_datasets, load_dataset 2 | 3 | PUBCHEM_DATASET = "alxfgh/PubChem_Drug_Instruction_Tuning" 4 | CHEMBL_DATASET = "alxfgh/ChEMBL_Drug_Instruction_Tuning" 5 | 6 | 7 | if __name__ == "__main__": 8 | # Load the two datasets 9 | dataset1 = load_dataset(PUBCHEM_DATASET) 10 | dataset2 = load_dataset(CHEMBL_DATASET) 11 | 12 | # Verify that the datasets have the same schema (i.e., the same fields) 13 | assert ( 14 | dataset1["train"].features == dataset2["train"].features 15 | ), "Datasets do not have the same schema" 16 | 17 | # Concatenate the 'train' split of dataset2 to the 'train' split of dataset1 18 | combined_dataset = concatenate_datasets([dataset1["train"], dataset2["train"]]) 19 | 20 | # Define the fractions for train/test/valid split 21 | train_fraction = 0.8 22 | test_fraction = 0.1 23 | # The remaining part will be the validation fraction 24 | 25 | # Generate the train/test/valid splits 26 | train_test_valid_datasets = combined_dataset.train_test_split( 27 | test_size=test_fraction, shuffle=True 28 | ) 29 | train_valid_datasets = train_test_valid_datasets["train"].train_test_split( 30 | test_size=(1 - train_fraction) / (1 - test_fraction), shuffle=True 31 | ) 32 | 33 | final_datasets = { 34 | "train": train_valid_datasets["train"], 35 | "test": train_test_valid_datasets["test"], 36 | "valid": train_valid_datasets["test"], 37 | } 38 | 39 | # Add the 'split' column to each dataset 40 | for split in final_datasets: 41 | final_datasets[split] = final_datasets[split].add_column( 42 | "split", [split] * len(final_datasets[split]) 43 | ) 44 | 45 | # Concatenate all splits again 46 | all_datasets = concatenate_datasets( 47 | [final_datasets[split] for split in final_datasets] 48 | ) 49 | df = all_datasets.to_pandas() 50 | 51 | df.rename(columns={"Answer": "answ", "Question": "quest"}, inplace=True) 52 | 53 | # Save the combined dataset as a CSV file 54 | df.to_csv("data_clean.csv", index=False) 55 | -------------------------------------------------------------------------------- /data/tabular/fda_adverse_reactions/meta.yaml: -------------------------------------------------------------------------------- 1 | name: fda_adverse_reactions 2 | description: A dataset of adverse reaction statistics for drugs and reaction events. 3 | targets: 4 | - id: count 5 | description: A count of how many reaction events occurred for this chembl id. 6 | units: 7 | type: ordinal 8 | names: 9 | - noun: adverse reaction frequency 10 | pubchem_aids: [] 11 | uris: [] 12 | - id: event 13 | description: The type of event that occurred for this molecule interaction. 14 | units: 15 | type: string 16 | names: 17 | - noun: adverse event reaction 18 | pubchem_aids: [] 19 | uris: [] 20 | identifiers: 21 | - id: SMILES 22 | type: SMILES 23 | description: This is the SMILES identifier for a given molecule. 
24 | license: CC BY-SA 3.0 25 | links: 26 | - name: Dataset 27 | url: https://platform.opentargets.org/downloads 28 | description: The website which we download the dataset from during the transformation script. 29 | benchmarks: [] 30 | num_points: 94910 31 | bibtex: [] 32 | -------------------------------------------------------------------------------- /data/tabular/flashpoint/meta.yaml: -------------------------------------------------------------------------------- 1 | name: flashpoint 2 | description: | 3 | Curation of experimentally determined flash point values measured with open cup and closed cup methods. 4 | The values are from academic papers, the Hazardous Chemicals Handbook, and the PubChem chemical database. 5 | Differences from the stated sources in the paper are: 6 | * Values from the DIPPR database are not included in their dataset as they are proprietary. 7 | * There are appear to be no values from Lange's handbook of chemistry in their dataset. 8 | * We did our own processing to resolve duplicate SMILES. 9 | targets: 10 | - id: flashpoint 11 | description: Experimental flash point value (K) 12 | units: K 13 | type: continuous 14 | names: 15 | - noun: flash point 16 | uris: 17 | - http://semanticscience.org/resource/CHEMINF_000417 18 | identifiers: 19 | - id: SMILES 20 | type: SMILES 21 | description: SMILES 22 | license: CC BY 4.0 23 | num_points: 9878 # downloaded dataset has 14696 datapoints, but there are duplicate smiles 24 | links: 25 | - url: https://figshare.com/ndownloader/files/18509711 26 | description: Original figshare dataset 27 | bibtex: 28 | - | 29 | "@article{sun2020assessing, 30 | title={Assessing Graph-based Deep Learning Models for Predicting Flash Point}, 31 | author={Sun, Xiaoyu and Krakauer, Nathaniel J and Politowicz, Alexander and Chen, Wei-Ting and Li, Qiying and Li, Zuoyi and Shao, Xianjia and Sunaryo, Alfred and Shen, Mingren and Wang, James and others}, 32 | journal={Molecular informatics}, 33 | volume={39}, 34 | number={6}, 35 | pages={e1900101}, 36 | year={2020} 37 | }" 38 | -------------------------------------------------------------------------------- /data/tabular/formation_energies/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_json( 6 | "https://raw.githubusercontent.com/CJBartel/TestStabilityML/master/mlstabilitytest/mp_data/data/hullout.json" 7 | ) 8 | 9 | df = df.T.reset_index().rename(columns={"index": "composition"}) 10 | df["rxn"] = df["rxn"].str.replace("_", " ") 11 | df.dropna(subset=["rxn", "Ef", "Ed"], inplace=True) 12 | df["Ef"] = df["Ef"].astype(float).round(3) 13 | df["Ed"] = df["Ed"].astype(float).round(3) 14 | print(len(df)) 15 | df.to_csv("data_clean.csv", index=False) 16 | 17 | 18 | if __name__ == "__main__": 19 | process() 20 | -------------------------------------------------------------------------------- /data/tabular/h2_storage_materials/meta.yaml: -------------------------------------------------------------------------------- 1 | name: h2_storage_reversible_hydrides 2 | description: synthetic procedures, experimental and theoretical h2 capacities of hydrides 3 | targets: 4 | - id: h_weight_density_theory 5 | description: theoretical hydrogen storage capacity 6 | units: wt% 7 | type: continuous 8 | names: 9 | - noun: theoretical hydrogen storage weight density 10 | - id: h_weight_density_experiment 11 | description: experimental hydrogen storage capacity 12 | units: wt% 13 | type: continuous 14 | 
names: 15 | - noun: experimental hydrogen storage capacity 16 | identifiers: 17 | - id: material_name 18 | type: IUPAC 19 | description: chemical name 20 | - id: chemical_formula 21 | type: COMPOSITION 22 | names: 23 | - noun: chemical formula 24 | description: chemical formula 25 | - id: synthetic_information 26 | names: 27 | - noun: synthesis procedure summary 28 | description: brief description of synthetic procedure 29 | type: Other 30 | license: File 31 | links: 32 | - url: https://datahub.hymarc.org/dataset/hydrogen-storage-materials-db/resource/4ef1c494-366e-43a3-bed4-a3985de5c374 33 | description: website with source data 34 | - url: https://datahub.hymarc.org/dataset/ad580d95-e7e2-4ef4-a7f6-3b2f91a96eba/resource/4ef1c494-366e-43a3-bed4-a3985de5c374/download/hydstormatdb-reversible_hydrides.csv 35 | description: original_dataset 36 | num_points: 30 37 | bibtex: 38 | - "@online{hymarcReversibleHydrides,\ntitle={Hydrogen Storage Materials Database Reversible Hydrides},\nauthor={HyMARC},\nyear={2019}" 39 | -------------------------------------------------------------------------------- /data/tabular/half_life_obach/meta.yaml: -------------------------------------------------------------------------------- 1 | name: half_life_obach 2 | description: |- 3 | Half life of a drug is the duration for the concentration of the drug 4 | in the body to be reduced by half. It measures the duration of actions of a drug. 5 | This dataset deposited version under CHEMBL assay 1614674. 6 | targets: 7 | - id: half_life_duration 8 | description: the time it takes for the plasma concentration of a drug in the body to be reduced by half 9 | units: hours 10 | type: continuous 11 | significant_digits: 2 12 | names: 13 | - noun: half life in humans after IV administration 14 | - noun: half life time in humans after IV administration 15 | - noun: drug half life time in humans after IV administration 16 | uris: 17 | - http://purl.bioontology.org/ontology/MESH/D006207 18 | benchmarks: 19 | - name: TDC 20 | link: https://tdcommons.ai/ 21 | split_column: split 22 | identifiers: 23 | - id: SMILES 24 | type: SMILES 25 | description: SMILES 26 | - id: chembl_id 27 | type: Other 28 | names: 29 | - noun: ChEMBL database id 30 | - noun: ChEMBL identifier number 31 | description: ChEMBL ids 32 | sample: false 33 | license: CC BY 4.0 34 | links: 35 | - url: https://doi.org/10.1124/dmd.108.020479 36 | description: corresponding publication 37 | - url: https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al 38 | description: data source 39 | num_points: 667 40 | bibtex: 41 | - |- 42 | @article{Obach2008, 43 | doi = {10.1124/dmd.108.020479}, 44 | url = {https://doi.org/10.1124/dmd.108.020479}, 45 | year = {2008}, 46 | month = apr, 47 | publisher = {American Society for Pharmacology and Experimental Therapeutics (ASPET)}, 48 | volume = {36}, 49 | number = {7}, 50 | pages = {1385--1405}, 51 | author = {R. Scott Obach and Franco Lombardo and Nigel J. Waters}, 52 | title = {Trend Analysis of a Database of Intravenous Pharmacokinetic 53 | Parameters in Humans for 670 Drug Compounds}, 54 | journal = {Drug Metabolism and Disposition} 55 | -------------------------------------------------------------------------------- /data/tabular/herg_central_at_10uM/meta.yaml: -------------------------------------------------------------------------------- 1 | name: herg_central_at_10uM 2 | description: "Human ether-à-go-go related gene (hERG) is crucial for the coordination\nof the heart's beating. 
Thus, if a drug blocks the hERG, it could lead to severe\nadverse effects. Therefore, reliable prediction of hERG liability in the early\nstages of drug design is quite important to reduce the risk of cardiotoxicity-related\nattritions in the later development stages. There are three targets: hERG_at_1microM,\nhERG_at_10microM, and herg_inhib." 3 | targets: 4 | - id: herg_central_at_10uM 5 | description: the percent inhibition of hERG at a 10uM concentration 6 | units: "%" 7 | type: continuous 8 | names: 9 | - noun: hERG inhibition at a concentration of 10uM 10 | - noun: hERG inhibition at a concentration of 10uM 11 | - noun: hERG inhibition at 10uM 12 | - noun: human ether-à-go-go related gene (hERG) inhibition at a concentration of 10uM 13 | - noun: human ether-à-go-go related gene (hERG) inhibition at 10uM 14 | - noun: human ether-à-go-go related gene (hERG) inhibition at 10uM 15 | uris: 16 | - http://purl.obolibrary.org/obo/MI_2136 17 | identifiers: 18 | - id: SMILES 19 | type: SMILES 20 | description: SMILES 21 | license: CC BY 4.0 22 | links: 23 | - url: https://doi.org/10.1089/adt.2011.0425 24 | description: corresponding publication 25 | - url: https://bbirnbaum.com/ 26 | description: TDC Contributer 27 | - url: https://tdcommons.ai/single_pred_tasks/tox/#herg-central 28 | description: Data source 29 | num_points: 306893 30 | bibtex: 31 | - "@article{Du2011,\ndoi = {10.1089/adt.2011.0425},\nurl = {https://doi.org/10.1089/adt.2011.0425},\nyear = {2011},\nmonth = dec,\npublisher = {Mary Ann Liebert Inc},\nvolume = {9},\nnumber = {6},\npages = {580--588},\nauthor = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li},\ntitle = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development},\njournal = {ASSAY and Drug Development Technologies}" 32 | -------------------------------------------------------------------------------- /data/tabular/herg_central_at_1uM/meta.yaml: -------------------------------------------------------------------------------- 1 | name: herg_central_at_1uM 2 | description: "Human ether-à-go-go related gene (hERG) is crucial for the coordination\nof the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe\nadverse effects. Therefore, reliable prediction of hERG liability in the early\nstages of drug design is quite important to reduce the risk of cardiotoxicity-related\nattritions in the later development stages. There are three targets: hERG_at_1microM,\nhERG_at_10microM, and herg_inhib." 
3 | targets: 4 | - id: herg_central_at_1uM 5 | description: the percent inhibition of hERG at a 1uM concentration 6 | units: "%" 7 | type: continuous 8 | names: 9 | - noun: hERG inhibition at a concentration of 1uM 10 | - noun: hERG inhibition at a concentration of 1uM 11 | - noun: hERG inhibition at 1uM 12 | - noun: human ether-à-go-go related gene (hERG) inhibition at a concentration of 1uM 13 | - noun: human ether-à-go-go related gene (hERG) inhibition at 1uM 14 | - noun: human ether-à-go-go related gene (hERG) inhibition at 1uM 15 | uris: 16 | - http://purl.obolibrary.org/obo/MI_2136 17 | identifiers: 18 | - id: SMILES 19 | type: SMILES 20 | description: SMILES 21 | license: CC BY 4.0 22 | links: 23 | - url: https://doi.org/10.1089/adt.2011.0425 24 | description: corresponding publication 25 | - url: https://bbirnbaum.com/ 26 | description: TDC Contributer 27 | - url: https://tdcommons.ai/single_pred_tasks/tox/#herg-central 28 | description: Data source 29 | num_points: 306893 30 | bibtex: 31 | - "@article{Du2011,\ndoi = {10.1089/adt.2011.0425},\nurl = {https://doi.org/10.1089/adt.2011.0425},\nyear = {2011},\nmonth = dec,\npublisher = {Mary Ann Liebert Inc},\nvolume = {9},\nnumber = {6},\npages = {580--588},\nauthor = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li},\ntitle = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development},\njournal = {ASSAY and Drug Development Technologies}" 32 | -------------------------------------------------------------------------------- /data/tabular/iupac_smiles/transform.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import fire 4 | import pandas as pd 5 | from datasets import load_dataset 6 | 7 | 8 | def process(debug=False): 9 | if not os.path.exists("combined_json.jsonl"): 10 | dataset = load_dataset("kjappelbaum/chemnlp_iupac_smiles") 11 | df = pd.DataFrame(dataset["train"]) 12 | else: 13 | file = "combined_json.jsonl" 14 | df = pd.read_json(file, lines=True) 15 | 16 | df.drop_duplicates(subset=["SMILES"], inplace=True) 17 | print(len(df)) 18 | 19 | if debug: 20 | df = df.sample(1000) 21 | 22 | df.to_csv("data_clean.csv", index=False) 23 | 24 | 25 | if __name__ == "__main__": 26 | fire.Fire(process) 27 | -------------------------------------------------------------------------------- /data/tabular/ld50_catmos/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ld50catmos", 8 | filename="cleaned_ld50.csv", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_csv(file) 12 | print(len(df)) 13 | df[ 14 | [ 15 | "num_ghose_violations", 16 | "num_lead_likeness_violations", 17 | "num_lipinski_violations", 18 | "num_carbon_atoms", 19 | "num_oxygen_atoms", 20 | ] 21 | ] = df[ 22 | [ 23 | "num_ghose_violations", 24 | "num_lead_likeness_violations", 25 | "num_lipinski_violations", 26 | "num_carbon_atoms", 27 | "num_oxygen_atoms", 28 | ] 29 | ].astype( 30 | int 31 | ) 32 | df.to_csv("data_clean.csv", index=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | process() 37 | -------------------------------------------------------------------------------- /data/tabular/ld50_zhu/meta.yaml: 
-------------------------------------------------------------------------------- 1 | name: ld50_zhu 2 | description: |- 3 | Acute toxicity LD50 measures 4 | the most conservative dose that can lead to lethal adverse effects. 5 | The higher the dose, the more lethal the drug. 6 | targets: 7 | - id: acute_toxicity 8 | description: Acute Toxicity LD50. 9 | units: log10(1/(mol/kg)) 10 | type: continuous 11 | names: 12 | - noun: acute oral toxicity rat LD50 13 | - noun: acute oral toxicity (LD50 in rats) 14 | - noun: LD50 in rats (oral exposure) 15 | - noun: rat LD50 (oral exposure) 16 | uris: 17 | - http://www.bioassayontology.org/bao#BAO_0002117 18 | identifiers: 19 | - id: SMILES 20 | type: SMILES 21 | description: SMILES 22 | - id: compound_name 23 | type: Other 24 | description: compound name 25 | names: 26 | - noun: compound 27 | - noun: compound name 28 | - noun: drug 29 | license: CC BY 4.0 30 | links: 31 | - url: https://doi.org/10.1021/tx900189p 32 | description: corresponding publication 33 | benchmarks: 34 | - name: TDC 35 | link: https://tdcommons.ai/ 36 | split_column: split 37 | num_points: 7385 38 | bibtex: 39 | - |- 40 | @article{Zhu2009, 41 | doi = {10.1021/tx900189p}, 42 | url = {https://doi.org/10.1021/tx900189p}, 43 | year = {2009}, 44 | month = oct, 45 | publisher = {American Chemical Society ({ACS})}, 46 | volume = {22}, 47 | number = {12}, 48 | pages = {1913--1921}, 49 | author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander 50 | Sedykh and Douglas M. Young and Alexander Tropsha}, 51 | title = {Quantitative Structure-Activity Relationship Modeling 52 | of Rat Acute Toxicity by Oral Exposure}, 53 | journal = {Chemical Research in Toxicology}} 54 | -------------------------------------------------------------------------------- /data/tabular/mattermodeling_stackexchange/meta.yaml: -------------------------------------------------------------------------------- 1 | name: mattermodeling_stackexchange 2 | description: |- 3 | Questions and answers mined from mattermodeling.stackexchange.com.
4 | targets: 5 | - id: a 6 | description: answer to the question 7 | type: string 8 | - id: title 9 | description: title of the question 10 | type: string 11 | identifiers: 12 | - id: q 13 | type: string 14 | description: question asked on mattermodeling.stackexchange.com 15 | license: CC BY-SA 16 | links: 17 | - url: mattermodeling.stackexchange.com 18 | description: original data source 19 | - url: https://stackoverflow.com/help/licensing 20 | description: information about the license 21 | num_points: 664 22 | templates: 23 | - |- 24 | {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} 25 | {#User: |Question: |Inquiry: |\n!}{#q} 26 | {#Assistant: |Answer: !}{#a} 27 | - |- 28 | {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} 29 | {#Question: |Inquiry: |\n!}{#q} 30 | {#Assistant: |Title: |Answer: !}{#title} 31 | -------------------------------------------------------------------------------- /data/tabular/melting_points/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def preprocess(): 5 | df = pd.read_csv( 6 | "https://www.dropbox.com/scl/fi/op8hf1zcl8cin4zb3qj0s/ochem_clean.csv?rlkey=j41m2z1jk7o9hupec19gaxov9&dl=1" 7 | ) 8 | df = df.rename(columns={"Melting Point": "mp_range"}) 9 | df.dropna(subset=["mp", "NAME", "SMILES", "mp_range"], inplace=True) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | preprocess() 15 | -------------------------------------------------------------------------------- /data/tabular/merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import fire 7 | import pandas as pd 8 | from tqdm import tqdm 9 | 10 | 11 | def merge_files(dir): 12 | fns = sorted(glob(os.path.join(dir, "data_clean-*.csv"))) 13 | fn_merged = os.path.join(dir, "data_clean.csv") 14 | if os.path.exists(fn_merged): 15 | os.remove(fn_merged) 16 | for fn in fns: 17 | df = pd.read_csv(fn, index_col=False, low_memory=False) 18 | df.to_csv( 19 | fn_merged, mode="a", index=False, header=not os.path.exists(fn_merged) 20 | ) 21 | os.remove(fn) 22 | del df 23 | 24 | 25 | def process_file(file: Union[str, Path]): 26 | dir = Path(file).parent 27 | # check if there is any csv file 28 | if not glob(os.path.join(dir, "*.csv")): 29 | return 30 | if len(glob(os.path.join(dir, "data_clean-0*.csv"))) >= 1: 31 | merge_files(dir) 32 | 33 | 34 | def process_all_files(data_dir): 35 | all_yaml_files = sorted(glob(os.path.join(data_dir, "**", "**", "meta.yaml"))) 36 | all_yaml_files = [f for f in all_yaml_files if "fda" in f] 37 | print(all_yaml_files) 38 | for yaml_file in tqdm(all_yaml_files): 39 | print(f"Processing {yaml_file}") 40 | try: 41 | process_file(yaml_file) 42 | except Exception as e: 43 | print(f"Could not process {yaml_file}: {e}") 44 | 45 | 46 | if __name__ == "__main__": 47 | fire.Fire(process_all_files) 48 | -------------------------------------------------------------------------------- /data/tabular/mofdscribe/meta.yaml: 
-------------------------------------------------------------------------------- 1 | name: mofdscribe 2 | description: |- 3 | Text descriptions of MOF structures. 4 | targets: 5 | - id: description 6 | description: description 7 | type: text 8 | names: 9 | - noun: description 10 | benchmarks: [] 11 | identifiers: 12 | - id: cif 13 | type: text 14 | description: CIFFILE 15 | names: 16 | - noun: CIF file 17 | - noun: Crystallographic Information File (CIF) 18 | - noun: CIF card 19 | license: CC BY 4.0 20 | links: 21 | - url: https://github.com/kjappelbaum/mofdscribe 22 | description: codebase used to generate this dataset 23 | num_points: 1267 24 | bibtex: 25 | - |- 26 | @article{Jablonka_2023, 27 | doi = {10.1021/acscentsci.2c01177}, 28 | url = {https://doi.org/10.1021%2Facscentsci.2c01177}, 29 | year = 2023, 30 | month = {mar}, 31 | publisher = {American Chemical Society ({ACS})}, 32 | volume = {9}, 33 | number = {4}, 34 | pages = {563--581}, 35 | author = {Kevin Maik Jablonka and Andrew S. Rosen and Aditi S. Krishnapriyan and Berend Smit}, 36 | title = {An Ecosystem for Digital Reticular Chemistry}, 37 | journal = {ACS Cent. Sci.} 38 | } 39 | - |- 40 | @article{Ganose_2019, 41 | doi = {10.1557/mrc.2019.94}, 42 | url = {https://doi.org/10.1557%2Fmrc.2019.94}, 43 | year = 2019, 44 | month = {sep}, 45 | publisher = {Springer Science and Business Media {LLC}}, 46 | volume = {9}, 47 | number = {3}, 48 | pages = {874--881}, 49 | author = {Alex M. Ganose and Anubhav Jain}, 50 | title = {Robocrystallographer: automated crystal structure text descriptions and analysis}, 51 | journal = {MRS Communications} 52 | } 53 | templates: 54 | - |- 55 | Task: {#Describe|Write a description of!} the structure with the {cif__names__noun} {cif#}. 56 | {#Answer: |A: |!}{description#} 57 | - |- 58 | Task: {#Create|Generate|Propose!} a {cif__names__noun} of a {#metal-organic framework|MOF|crystal structure|structure|material!} with the following description 59 | {description#}. 60 | {#Answer: |A: |!}{cif#} 61 | -------------------------------------------------------------------------------- /data/tabular/mofdscribe/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | from chemnlp.data.convert import remove_composition_rows 5 | 6 | 7 | def process(): 8 | file = hf_hub_download( 9 | repo_id="kjappelbaum/chemnlp-text-mofdscribe", 10 | filename="data/train-00000-of-00001-ccae794e6d461778.parquet", 11 | repo_type="dataset", 12 | ) 13 | df = pd.read_parquet(file) 14 | print(len(df)) 15 | df["cif"] = df["cif"].apply(remove_composition_rows) 16 | df.to_csv("data_clean.csv", index=False) 17 | 18 | 19 | if __name__ == "__main__": 20 | process() 21 | -------------------------------------------------------------------------------- /data/tabular/mol2svg/meta.yaml: -------------------------------------------------------------------------------- 1 | name: mol2svg 2 | description: |- 3 | This dataset contains SVG images of molecules, including some with substructures 4 | highlighted. 
5 | targets: 6 | - id: completion 7 | type: text 8 | description: completion 9 | identifiers: 10 | - id: prompt 11 | type: text 12 | description: prompt 13 | - id: SMILES 14 | type: SMILES 15 | description: SMILES 16 | license: CC BY 4.0 17 | num_points: 16019 18 | links: 19 | - url: https://github.com/lamalab-org/chem-caption 20 | description: Original codebase used to generate this dataset 21 | templates: 22 | - |- 23 | {prompt#} 24 | {completion#} 25 | -------------------------------------------------------------------------------- /data/tabular/mol2svg/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def preprocess(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-mol-svg") 6 | df = dataset["train"].to_pandas() 7 | df.dropna(inplace=True) 8 | print(len(df)) 9 | df.to_csv("data_clean.csv", index=False) 10 | 11 | 12 | if __name__ == "__main__": 13 | preprocess() 14 | -------------------------------------------------------------------------------- /data/tabular/moses/meta.yaml: -------------------------------------------------------------------------------- 1 | name: moses 2 | description: |- 3 | Molecular Sets (MOSES) is a benchmark platform 4 | for distribution learning based molecule generation. 5 | Within this benchmark, MOSES provides a cleaned dataset of molecules that are ideal of optimization. 6 | It is processed from the ZINC Clean Leads dataset. 7 | benchmarks: 8 | - name: TDC 9 | link: https://tdcommons.ai/ 10 | split_column: split 11 | identifiers: 12 | - id: SMILES 13 | type: SMILES 14 | description: SMILES 15 | license: CC BY 4.0 16 | links: 17 | - url: https://arxiv.org/abs/1811.12823 18 | description: Article about original dataset 19 | - url: https://pubs.acs.org/doi/abs/10.1021/acs.jcim.5b00559 20 | description: Link to publication of associated dataset - zinc 21 | - url: https://github.com/molecularsets/moses 22 | description: Github repository concerning the dataset 23 | num_points: 1936962 24 | bibtex: 25 | - |- 26 | @article{10.3389/fphar.2020.565644, 27 | title={{M}olecular {S}ets ({MOSES}): {A} {B}enchmarking {P}latform for {M}olecular {G}eneration {M}odels}, 28 | author={Polykovskiy, Daniil and Zhebrak, Alexander and Sanchez-Lengeling, Benjamin and Golovanov, 29 | Sergey and Tatanov, Oktai and Belyaev, Stanislav and Kurbanov, Rauf and Artamonov, 30 | Aleksey and Aladinskiy, Vladimir and Veselov, Mark and Kadurin, Artur and Johansson, 31 | Simon and Chen, Hongming and Nikolenko, Sergey and Aspuru-Guzik, Alan and Zhavoronkov, Alex}, 32 | journal={Frontiers in Pharmacology}, 33 | year={2020} 34 | } 35 | -------------------------------------------------------------------------------- /data/tabular/mp_anisotropy/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def transform(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-mp-elastic-anisotropy")["train"] 6 | df = dataset.to_pandas() 7 | print(len(df)) 8 | df[["formula", "elastic_anisotropy", "split"]].to_csv("data_clean.csv", index=False) 9 | 10 | 11 | if __name__ == "__main__": 12 | transform() 13 | -------------------------------------------------------------------------------- /data/tabular/mp_bulk_modulus/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def transform(): 5 | dataset = 
load_dataset("kjappelbaum/chemnlp-mp-bulk-modulus")["train"] 6 | df = dataset.to_pandas() 7 | print(len(df)) 8 | df[["formula", "bulk_modulus", "split"]].to_csv("data_clean.csv", index=False) 9 | 10 | 11 | if __name__ == "__main__": 12 | transform() 13 | -------------------------------------------------------------------------------- /data/tabular/mp_descriptions/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | from chemnlp.data.convert import remove_composition_rows 4 | 5 | 6 | def process(): 7 | dataset = load_dataset("kjappelbaum/chemnlp-robocrys") 8 | df = dataset["train"].to_pandas() 9 | df.dropna( 10 | subset=["cifstr", "description", "description_w_bondlengths"], inplace=True 11 | ) 12 | df["cifstr"] = df["cifstr"].apply(remove_composition_rows) 13 | print(len(df)) 14 | df.to_csv("data_clean.csv", index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | process() 19 | -------------------------------------------------------------------------------- /data/tabular/mp_self_supervised/prepare_data.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | from glob import glob 3 | 4 | import pandas as pd 5 | from pymatgen.core import Structure 6 | from tqdm import tqdm 7 | 8 | from chemnlp.data.convert import cif_file_to_string, is_longer_than_allowed 9 | 10 | data = [] 11 | 12 | 13 | def compile_info(ciffile): 14 | s = Structure.from_file(ciffile) 15 | cif = cif_file_to_string(ciffile) 16 | sg, sg_n = s.get_space_group_info() 17 | 18 | d = { 19 | "formula": s.composition.reduced_formula, 20 | "density": s.density, 21 | "spacegroup": sg, 22 | "spacegroup_number": sg_n, 23 | "cif": cif, 24 | "is_longer_than_allowed": is_longer_than_allowed(cif), 25 | } 26 | return d 27 | 28 | 29 | if __name__ == "__main__": 30 | all_structures = glob("structures/*.cif") # assumes structures have been downloaded 31 | 32 | data = [] 33 | with concurrent.futures.ProcessPoolExecutor() as executor: 34 | for d in tqdm( 35 | executor.map(compile_info, all_structures), total=len(all_structures) 36 | ): 37 | data.append(d) 38 | 39 | df = pd.DataFrame(data) 40 | df.to_json("mpid.json") 41 | -------------------------------------------------------------------------------- /data/tabular/mp_self_supervised/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def remove_composition_from_cif(cif): 5 | # in the second line of cif, split at _ and then take the first element and join it with "cif" 6 | parts = cif.split("\n") 7 | parts[1] = "data_cif" 8 | return "\n".join(parts) 9 | 10 | 11 | def process(): 12 | df = pd.read_json( 13 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-mp-cifs/resolve/main/mpid.json" 14 | ) 15 | df = df.query("is_longer_than_allowed==False").dropna() 16 | df["cif"] = df["cif"].apply(remove_composition_from_cif) 17 | print(len(df)) 18 | df.to_csv("data_clean.csv", index=False) 19 | 20 | 21 | if __name__ == "__main__": 22 | process() 23 | -------------------------------------------------------------------------------- /data/tabular/mp_shear_modulus/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def transform(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-mp-shear-modulus")["train"] 6 | df = dataset.to_pandas() 7 | print(len(df)) 8 | df[["formula", 
"shear_modulus", "split"]].to_csv("data_clean.csv", index=False) 9 | 10 | 11 | if __name__ == "__main__": 12 | transform() 13 | -------------------------------------------------------------------------------- /data/tabular/ncbi_disease/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ncbi_disease 2 | description: |- 3 | ncbi_disease is a named entity recognition dataset for disease mentions. 4 | targets: 5 | - id: matched_words 6 | description: matched words 7 | type: text 8 | names: 9 | - noun: entity 10 | - noun: matched entity 11 | identifiers: 12 | - id: sentence 13 | description: Sentence 14 | type: text 15 | names: 16 | - noun: sentence 17 | - noun: text 18 | license: https://huggingface.co/datasets/bigbio/blurb/blob/main/LICENSE 19 | links: 20 | - url: https://huggingface.co/datasets/bigbio/blurb 21 | description: original dataset 22 | benchmarks: 23 | - name: ncbi_disease 24 | link: hhttps://huggingface.co/datasets/bigbio/blurb 25 | split_column: split 26 | num_points: 7075 27 | bibtex: 28 | - |- 29 | @article{gu2021domain, 30 | title = { 31 | Domain-specific language model pretraining for biomedical natural 32 | language processing 33 | }, 34 | author = { 35 | Gu, Yu and Tinn, Robert and Cheng, Hao and Lucas, Michael and 36 | Usuyama, Naoto and Liu, Xiaodong and Naumann, Tristan and Gao, 37 | Jianfeng and Poon, Hoifung 38 | }, 39 | year = 2021, 40 | journal = {ACM Transactions on Computing for Healthcare (HEALTH)}, 41 | publisher = {ACM New York, NY}, 42 | volume = 3, 43 | number = 1, 44 | pages = {1--23} 45 | } 46 | templates: 47 | - |- 48 | Task: Find all the mentions of diseases in the {#following|subsequent!} {#text|sentence!}. Return the matching {#words|entities!}. If there is no {#match|mention of a disease|matching entity!}, return `no match`. 
49 | {#Sentence|Description!}: {sentence#} 50 | Answer: {matched_words#} 51 | - |- 52 | User: Does the following text contain mentions of diseases?{# Can you return matches?| Can you output matches?!} 53 | {#Text: |!}{sentence#} 54 | Assistant: {#I found|There is!} {matched_words#} 55 | -------------------------------------------------------------------------------- /data/tabular/ncbi_disease/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | 4 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner 5 | from chemnlp.data.utils import oxford_comma_join 6 | 7 | 8 | def process(): 9 | # tokenized at whitespaces and punctuations 10 | dataset = load_dataset("bigbio/blurb", "ncbi_disease") 11 | dfs = [] 12 | for split in ["train", "validation", "test"]: 13 | df_ = dataset[split].to_pandas() 14 | df_["split"] = split 15 | dfs.append(df_) 16 | df = pd.concat(dfs) 17 | ner_labels = df["ner_tags"] 18 | 19 | matched_words = [] 20 | for tokens, ner_label in zip(df["tokens"], ner_labels): 21 | words = group_tokens_by_labels(tokens, ner_label) 22 | if len(words) == 0: 23 | matched_words.append("no match") 24 | else: 25 | matched_words.append(oxford_comma_join(words)) 26 | 27 | df["matched_words"] = matched_words 28 | df["sentence"] = df["tokens"].apply(punctuation_joiner) 29 | 30 | df = df[["sentence", "matched_words"]] 31 | 32 | # ensure we have at least 5 words in a sentence 33 | df = df[df["sentence"].apply(lambda x: len(x.split()) >= 5)] 34 | 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/nomad_structure/transform.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import pandas as pd 3 | 4 | from chemnlp.data.convert import mask_cif_lines, remove_composition_rows 5 | 6 | DATASET_NAME = "nomad-structure" 7 | 8 | 9 | def prepare_data(): 10 | dataset_name = "n0w0f/nomad-structure-csv" 11 | split_name = "train" # data without any split @ hf 12 | filename_to_save = "data_clean.csv" 13 | 14 | # Load the dataset from Hugging Face 15 | dataset = datasets.load_dataset(dataset_name, split=split_name) 16 | 17 | df = pd.DataFrame(dataset) 18 | df = df[~df["is_longer_than_allowed"]] 19 | # assert column names 20 | fields_orig = df.columns.tolist() 21 | assert fields_orig == [ 22 | "cif", 23 | "formula", 24 | "spacegroup", 25 | "spacegroup_number", 26 | "crystal_system", 27 | "pointgroup", 28 | "density", 29 | "is_longer_than_allowed", 30 | ] 31 | df["cif"] = df["cif"].apply(remove_composition_rows) 32 | df["cif_masked"] = df["cif"].apply(mask_cif_lines) 33 | # remove duplicates if any 34 | df = df.drop_duplicates() 35 | df.dropna(inplace=True) 36 | df.to_csv(filename_to_save, index=False) 37 | datapoints = len(df) 38 | return datapoints 39 | 40 | 41 | if __name__ == "__main__": 42 | print(f" Preparing clean tabular {DATASET_NAME} datatset") 43 | datapoints = prepare_data() 44 | print( 45 | f" Finished Preparing clean tabular {DATASET_NAME} datatset with {datapoints} datapoints" 46 | ) 47 | -------------------------------------------------------------------------------- /data/tabular/ocp/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ocp 2 | description: |- 3 | CatBerta training data. 
4 | targets: 5 | - id: target 6 | description: target 7 | type: continuous 8 | units: eV 9 | significant_digits: 4 10 | names: 11 | - noun: adsorption energy 12 | identifiers: 13 | - id: text 14 | type: text 15 | description: description 16 | license: MIT (based on ocp) 17 | links: 18 | - url: https://drive.google.com/drive/folders/1puiJ9FbLEA3QIHmZromecEndlemag9hg?usp=sharing 19 | description: original data source 20 | num_points: 125000 21 | bibtex: 22 | - |- 23 | @article{ock2023catalyst, 24 | title={Catalyst Property Prediction with CatBERTa: Unveiling Feature Exploration Strategies through Large Language Models}, 25 | author={Ock, Janghoon and Guntuboina, Chakradhar and Farimani, Amir Barati}, 26 | journal={arXiv preprint arXiv:2309.00563}, 27 | year={2023} 28 | } 29 | - |- 30 | @article{ocp_dataset, 31 | author = {Chanussot*, Lowik and Das*, Abhishek and Goyal*, Siddharth and Lavril*, Thibaut and Shuaibi*, Muhammed and Riviere, Morgane and Tran, Kevin and Heras-Domingo, Javier and Ho, Caleb and Hu, Weihua and Palizhati, Aini and Sriram, Anuroop and Wood, Brandon and Yoon, Junwoong and Parikh, Devi and Zitnick, C. Lawrence and Ulissi, Zachary}, 32 | title = {Open Catalyst 2020 (OC20) Dataset and Community Challenges}, 33 | journal = {ACS Catalysis}, 34 | year = {2021}, 35 | doi = {10.1021/acscatal.0c04525}, 36 | } 37 | templates: 38 | - |- 39 | Question: What is the adsorption energy of the following adsorbate-adsorbent pair? 40 | Text: {text#} 41 | Answer: {target#} {target__units} 42 | - |- 43 | Task: {#Predict|Estimate|Calculate|Compute|Determine!} the adsorption energy of the following adsorbate-adsorbent pair. 44 | Text: {text#} 45 | Answer: {target#} {target__units} 46 | -------------------------------------------------------------------------------- /data/tabular/ocp/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | from pylatexenc.latexencode import unicode_to_latex 4 | 5 | 6 | def uniCode2Latex(text: str) -> str: 7 | """ 8 | converts unicode text to latex and 9 | fixes UTF-8 chars for latex in a certain range: 10 | ₀:$_0$ ... 
₉:$_9$ 11 | 12 | see https://github.com/phfaist/pylatexenc/issues/72 13 | 14 | Args: 15 | text(str): the string to fix 16 | 17 | Return: 18 | str: latex presentation of UTF-8 char 19 | """ 20 | for code in range(8320, 8330): 21 | text = text.replace(chr(code), f"$_{code-8320}$") 22 | 23 | text = text.replace("\u0305", "$^-$") 24 | text = text.replace("\u207A", "$^+$") 25 | text = text.replace("\u207B", "$^-$") 26 | text = text.replace("\u2074", "$^4$") 27 | text = text.replace("\u2070", "$^0$") 28 | text = text.replace("\u2078", "$^1$") 29 | text = text.replace("\u2075", "$^2$") 30 | text = text.replace("\u2076", "$^3$") 31 | text = text.replace("\u2077", "$^5$") 32 | 33 | return unicode_to_latex(text) 34 | 35 | 36 | def process(): 37 | dataset = load_dataset("kjappelbaum/chemnlp-ocp") 38 | df_train = dataset["train"].to_pandas() 39 | df_val = dataset["valid"].to_pandas() 40 | 41 | df_train["split"] = "train" 42 | df_val["split"] = "valid" 43 | 44 | df = pd.concat([df_train, df_val]) 45 | df["text"] = df["text"].apply(uniCode2Latex) 46 | print(len(df)) 47 | df.to_csv("data_clean.csv", index=False) 48 | 49 | 50 | if __name__ == "__main__": 51 | process() 52 | -------------------------------------------------------------------------------- /data/tabular/opv/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def process(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-opv")["train"] 6 | df = dataset.to_pandas() 7 | df["LUMO"] = -1 * df["-LUMO (eV)"] 8 | df["HOMO"] = -1 * df["-HOMO (eV)"] 9 | df.rename( 10 | columns={ 11 | "PCE_ave(%)": "PCE_ave", 12 | "Voc (V)": "Voc", 13 | "Jsc (mA cm^2)": "Jsc", 14 | "Mw (kg mol^-1)": "Mw", 15 | "Mn (kg mol^-1)": "Mn", 16 | "PDI (=Mw/Mn)": "PDI", 17 | "bandgap(eV)": "bandgap", 18 | }, 19 | inplace=True, 20 | ) 21 | 22 | df = df.dropna( 23 | subset=[ 24 | "HOMO", 25 | "LUMO", 26 | "Mw", 27 | "PDI", 28 | "FF", 29 | "Jsc", 30 | "Voc", 31 | "PCE_ave", 32 | "bandgap", 33 | ] 34 | ) 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/oqmd/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def process(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-oqmd")["train"] 6 | df = dataset.to_pandas() 7 | 8 | df.dropna( 9 | subset=[ 10 | "name", 11 | "formula", 12 | "spacegroup", 13 | "nelements", 14 | "nsites", 15 | "energy_per_atom", 16 | "formation_energy_per_atom", 17 | "band_gap", 18 | "volume_per_atom", 19 | "magnetization_per_atom", 20 | "atomic_volume_per_atom", 21 | ], 22 | inplace=True, 23 | ) 24 | print(len(df)) 25 | df.to_csv("data_clean.csv", index=False) 26 | 27 | 28 | if __name__ == "__main__": 29 | process() 30 | -------------------------------------------------------------------------------- /data/tabular/orbnet_denali/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def process(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-orbnet-denali") 6 | df = dataset["train"].to_pandas() 7 | df = df.dropna() 8 | print(len(df)) 9 | df.rename(columns={"smiles": "SMILES"}, inplace=True) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | process() 15 | 
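Note: the transform scripts in this directory follow a common shape — load a raw table (typically from the Hugging Face Hub or a hosted CSV/JSON), drop incomplete rows, normalise column names such as `smiles` → `SMILES`, print the row count, and write `data_clean.csv` next to the dataset's `meta.yaml`. The sketch below is only an illustration of that shared pattern, not a file from this repository; the Hub repo id and the column names are placeholders.

```python
# Illustrative only: a minimal transform.py following the pattern used throughout
# data/tabular/. The Hub repo id and column names are placeholders, not a real dataset.
from datasets import load_dataset


def process():
    # 1. Load the raw data (here: a hypothetical Hugging Face dataset).
    df = load_dataset("some-org/example-dataset")["train"].to_pandas()

    # 2. Drop rows missing values in the columns the prompt templates rely on.
    df = df.dropna(subset=["smiles", "target"])

    # 3. Normalise column names so downstream sampling code finds "SMILES".
    df = df.rename(columns={"smiles": "SMILES"})

    # 4. Report the number of datapoints (should match num_points in meta.yaml)
    #    and write the cleaned table next to meta.yaml.
    print(len(df))
    df.to_csv("data_clean.csv", index=False)


if __name__ == "__main__":
    process()
```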
-------------------------------------------------------------------------------- /data/tabular/ord_masked/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_rxn_smiles_yield_pred 2 | description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | identifiers: 5 | - id: masked_rxn_smiles 6 | type: text 7 | description: reaction SMILES with one element masked 8 | names: 9 | - noun: reaction SMILES with one element masked as `MASK` 10 | - noun: reaction SMILES with one element hidden as `MASK` 11 | - noun: masked reaction SMILES (one component masked as `MASK`) 12 | - noun: masked reaction SMILES string (one component masked as `MASK`) 13 | - noun: masked RXNSMILES (one component masked as `MASK`) 14 | targets: 15 | - id: missing_component 16 | type: text 17 | description: masked element 18 | license: CC BY SA 4.0 19 | links: 20 | - url: https://github.com/open-reaction-database/ord-data 21 | description: original data source 22 | num_points: 2263983 23 | bibtex: 24 | - |- 25 | @article{Kearnes_2021, 26 | doi = {10.1021/jacs.1c09820}, 27 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 28 | year = 2021, 29 | month = {nov}, 30 | publisher = {American Chemical Society ({ACS})}, 31 | volume = {143}, 32 | number = {45}, 33 | pages = {18820--18826}, 34 | author = {Steven M. Kearnes and Michael R. Maser 35 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 36 | and Spencer D. Dreher and Joel M. Hawkins 37 | and Klavs F. Jensen and Connor W. Coley}, 38 | title = {The Open Reaction Database}, 39 | journal = {J. Am. Chem. Soc.} 40 | } 41 | templates: 42 | - The masked component in the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#} is {missing_component#}. 43 | - The {#chemical|compound!} with SMILES {missing_component#} is the masked component in the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#}. 44 | - |- 45 | Question: What is the masked component in the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#}? 46 | Answer: {missing_component#}. 47 | - |- 48 | Task: Predict the masked component in a {masked_rxn_smiles__names__noun}. 
49 | Description: {masked_rxn_smiles#} 50 | {#Answer|Solution!}: {missing_component#} 51 | -------------------------------------------------------------------------------- /data/tabular/ord_masked/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ord", 8 | filename="ord_rxn.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df.dropna(subset=["masked_rxn_smiles", "missing_component"], inplace=True) 13 | print(len(df)) 14 | df.to_csv("data_clean.csv", index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | process() 19 | -------------------------------------------------------------------------------- /data/tabular/ord_predictions/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def oxford_comma_join(elements): 6 | try: 7 | if len(elements) == 1: 8 | return elements[0] 9 | elif len(elements) == 2: 10 | return " and ".join(elements) 11 | else: 12 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 13 | except Exception: 14 | return None 15 | 16 | 17 | def process(): 18 | file = hf_hub_download( 19 | repo_id="kjappelbaum/chemnlp-ord", 20 | filename="ord_rxn.json", 21 | repo_type="dataset", 22 | ) 23 | df = pd.read_json(file) 24 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 25 | df["product_string"] = df["products"].apply(oxford_comma_join) 26 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 27 | df.dropna(subset=["educt_string", "product_string"], inplace=True) 28 | print(len(df)) 29 | df[["RXNSMILES", "educt_string", "product_string"]].to_csv( 30 | "data_clean.csv", index=False 31 | ) 32 | 33 | 34 | if __name__ == "__main__": 35 | process() 36 | -------------------------------------------------------------------------------- /data/tabular/ord_procedure_steps/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_procedure_steps 2 | description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | identifiers: 5 | - id: steps_string 6 | type: text 7 | description: reaction action sequence 8 | names: 9 | - noun: reaction action sequence 10 | - noun: reaction action steps 11 | targets: 12 | - id: procedure 13 | type: text 14 | description: reaction procedure 15 | names: 16 | - noun: reaction procedure 17 | - noun: description of reaction procedure 18 | - noun: reaction procedure description 19 | - noun: procedure 20 | license: CC BY SA 4.0 21 | links: 22 | - url: https://github.com/open-reaction-database/ord-data 23 | description: original data source 24 | num_points: 76815 25 | bibtex: 26 | - |- 27 | @article{Kearnes_2021, 28 | doi = {10.1021/jacs.1c09820}, 29 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 30 | year = 2021, 31 | month = {nov}, 32 | publisher = {American Chemical Society ({ACS})}, 33 | volume = {143}, 34 | number = {45}, 35 | pages = {18820--18826}, 36 | author = {Steven M. Kearnes and Michael R. Maser 37 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 38 | and Spencer D. Dreher and Joel M. Hawkins 39 | and Klavs F. Jensen and Connor W. Coley}, 40 | title = {The Open Reaction Database}, 41 | journal = {J. Am. Chem. 
Soc.} 42 | } 43 | templates: 44 | - |- 45 | User: {#Can you|Could you!} {#tell me|give me|show me!} the {procedure__names__noun} for the {steps_string__names__noun} {steps_string#}? 46 | Assistant: {#I propose|I suggest!} the {procedure__names__noun} {procedure#} 47 | - |- 48 | User: {#Can you|Could you!} {#tell me|give me|show me!} the {steps_string__names__noun} for the {procedure__names__noun} {procedure#}? 49 | Assistant: {#I propose|I suggest!} the {steps_string__names__noun} {steps_string#} 50 | - |- 51 | Task: Convert a {procedure__names__noun} into a {steps_string__names__noun}. 52 | Procedure: {procedure#} 53 | Answer: {steps_string#} 54 | -------------------------------------------------------------------------------- /data/tabular/ord_procedure_steps/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ord", 8 | filename="ord_data_compiled.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df = df.dropna(subset=["steps_string", "procedure"]) 13 | df.query("steps_string != 'None'", inplace=True) 14 | df.query("procedure != 'None'", inplace=True) 15 | df = df[["steps_string", "procedure"]] 16 | print(len(df)) 17 | df.to_csv("data_clean.csv", index=False) 18 | 19 | 20 | if __name__ == "__main__": 21 | process() 22 | -------------------------------------------------------------------------------- /data/tabular/ord_rxn_smiles_procedure/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | from rxn.chemutils.reaction_equation import rxn_standardization 4 | from rxn.chemutils.reaction_smiles import parse_any_reaction_smiles 5 | 6 | 7 | def canoncialize_rxn_smiles(rxn_smiles): 8 | try: 9 | return rxn_standardization(parse_any_reaction_smiles(rxn_smiles)).to_string() 10 | except Exception: 11 | return None 12 | 13 | 14 | def process(): 15 | file = hf_hub_download( 16 | repo_id="kjappelbaum/chemnlp-ord", 17 | filename="ord_data_compiled.json", 18 | repo_type="dataset", 19 | ) 20 | df = pd.read_json(file) 21 | df["canonical_rxn_smiles"] = df["rxn_smiles"].apply(canoncialize_rxn_smiles) 22 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 23 | df = df.dropna(subset=["RXNSMILES", "procedure"]) 24 | df = df.query("RXNSMILES != 'None'") 25 | # make sure RXNSMILES values have at least 10 characters 26 | df = df[df["RXNSMILES"].str.len() > 10] 27 | # there must be > in the reaction SMILES 28 | df = df[df["RXNSMILES"].str.contains(">")] 29 | df = df.query("procedure != 'None'") 30 | df.query( 31 | "steps_string != 'None'", inplace=True 32 | ) # this removes cases in which is just says "follow the procedure above" 33 | df = df.query("procedure != ''") 34 | df = df[["RXNSMILES", "procedure"]] 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/ord_rxn_smiles_yield_pred/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_rxn_smiles_yield_pred 2 | description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | targets: 5 | - id: yield 6 | type: continuous 7 | significant_digits: 
0 8 | description: reaction yield 9 | units: \% 10 | names: 11 | - noun: yield 12 | - noun: reaction yield 13 | identifiers: 14 | - id: RXNSMILES 15 | type: RXNSMILES 16 | description: reaction SMILES 17 | names: 18 | - noun: reaction SMILES 19 | - noun: reaction SMILES string 20 | - noun: RXNSMILES 21 | - noun: reaction SMILES (RXNSMILES) 22 | license: CC BY SA 4.0 23 | links: 24 | - url: https://github.com/open-reaction-database/ord-data 25 | description: original data source 26 | num_points: 28 27 | bibtex: 28 | - |- 29 | @article{Kearnes_2021, 30 | doi = {10.1021/jacs.1c09820}, 31 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 32 | year = 2021, 33 | month = {nov}, 34 | publisher = {American Chemical Society ({ACS})}, 35 | volume = {143}, 36 | number = {45}, 37 | pages = {18820--18826}, 38 | author = {Steven M. Kearnes and Michael R. Maser 39 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 40 | and Spencer D. Dreher and Joel M. Hawkins 41 | and Klavs F. Jensen and Connor W. Coley}, 42 | title = {The Open Reaction Database}, 43 | journal = {J. Am. Chem. Soc.} 44 | } 45 | templates: 46 | - The {yield__names__noun} of a reaction with the {RXNSMILES__names__noun} {RXNSMILES#} is {yield#}{yield__units}. 47 | - |- 48 | User: {#I need|I want|I would like!} to run a reaction with the {RXNSMILES__names__noun} {RXNSMILES#}. What is the {yield__names__noun} {#I can expect|I should expect|I should get|I can get!}? 49 | Assistant: {#The|The expected|The predicted|The estimated!} {yield__names__noun} is {yield#}{yield__units}. 50 | - |- 51 | Question: {#What is|What's|What is the|What's the!} {yield__names__noun} of a reaction with the {RXNSMILES__names__noun} {RXNSMILES#}? 52 | Answer: {yield#}{yield__units}. 53 | -------------------------------------------------------------------------------- /data/tabular/ord_rxn_smiles_yield_pred/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | from rxn.chemutils.reaction_equation import rxn_standardization 4 | from rxn.chemutils.reaction_smiles import parse_any_reaction_smiles 5 | 6 | 7 | def canoncialize_rxn_smiles(rxn_smiles): 8 | try: 9 | return rxn_standardization(parse_any_reaction_smiles(rxn_smiles)).to_string() 10 | except Exception: 11 | return None 12 | 13 | 14 | def process(): 15 | file = hf_hub_download( 16 | repo_id="kjappelbaum/chemnlp-ord", 17 | filename="ord_data_compiled.json", 18 | repo_type="dataset", 19 | ) 20 | df = pd.read_json(file) 21 | df["canonical_rxn_smiles"] = df["rxn_smiles"].apply(canoncialize_rxn_smiles) 22 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 23 | df = df.dropna(subset=["RXNSMILES", "yield"]) 24 | # make sure RXNSMILES values have at least 10 characters 25 | df = df[df["RXNSMILES"].str.len() > 10] 26 | # there must be > in the reaction SMILES 27 | df = df[df["RXNSMILES"].str.contains(">")] 28 | df.query( 29 | "steps_string != 'None'", inplace=True 30 | ) # this removes cases in which is just says "follow the procedure above" 31 | df = df.query("RXNSMILES != 'None'") 32 | df = df[["RXNSMILES", "yield"]] 33 | print(len(df)) 34 | df.to_csv("data_clean.csv", index=False) 35 | 36 | 37 | if __name__ == "__main__": 38 | process() 39 | -------------------------------------------------------------------------------- /data/tabular/ord_steps_yield/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_steps_yield 2 | 
description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | identifiers: 5 | - id: non_yield_steps_string 6 | type: text 7 | description: reaction action sequence 8 | names: 9 | - noun: reaction action sequence 10 | - noun: reaction action steps 11 | targets: 12 | - id: yield 13 | type: continuous 14 | significant_digits: 0 15 | description: reaction yield 16 | units: \% 17 | names: 18 | - noun: yield 19 | - noun: reaction yield 20 | license: CC BY SA 4.0 21 | links: 22 | - url: https://github.com/open-reaction-database/ord-data 23 | description: original data source 24 | num_points: 30 25 | bibtex: 26 | - |- 27 | @article{Kearnes_2021, 28 | doi = {10.1021/jacs.1c09820}, 29 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 30 | year = 2021, 31 | month = {nov}, 32 | publisher = {American Chemical Society ({ACS})}, 33 | volume = {143}, 34 | number = {45}, 35 | pages = {18820--18826}, 36 | author = {Steven M. Kearnes and Michael R. Maser 37 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 38 | and Spencer D. Dreher and Joel M. Hawkins 39 | and Klavs F. Jensen and Connor W. Coley}, 40 | title = {The Open Reaction Database}, 41 | journal = {J. Am. Chem. Soc.} 42 | } 43 | templates: 44 | - |- 45 | The {yield__names__noun} of a reaction with the {non_yield_steps_string__names__noun} below is {yield#}{yield__units}. 46 | {non_yield_steps_string__names__noun}: {non_yield_steps_string#} 47 | - |- 48 | User: {#I need|I want|I would like!} to run a reaction with the {non_yield_steps_string__names__noun} {non_yield_steps_string#}. What is the {yield__names__noun} {#I can expect|I should expect|I should get|I can get!}? 49 | Assistant: {#The|The expected|The predicted|The estimated!} {yield__names__noun} is {yield#}{yield__units}. 50 | - |- 51 | Task: {#Predict|Estimate!} the {yield__names__noun} of a reaction based on the {non_yield_steps_string__names__noun}. 
52 | Description: {non_yield_steps_string#} 53 | Answer: {yield#}{yield__units} 54 | -------------------------------------------------------------------------------- /data/tabular/ord_steps_yield/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ord", 8 | filename="ord_data_compiled.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df = df.dropna(subset=["non_yield_steps_string", "yield"]) 13 | df = df.query("non_yield_steps_string != 'None'") 14 | df = df[["non_yield_steps_string", "yield"]] 15 | print(len(df)) 16 | df.to_csv("data_clean.csv", index=False) 17 | 18 | 19 | if __name__ == "__main__": 20 | process() 21 | -------------------------------------------------------------------------------- /data/tabular/perovskite_db/transform.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | from datasets import load_dataset 4 | 5 | 6 | def oxford_comma_join(list_of_str): 7 | if len(list_of_str) == 1: 8 | return list_of_str[0] 9 | elif len(list_of_str) == 2: 10 | return " and ".join(list_of_str) 11 | else: 12 | return ", ".join(list_of_str[:-1]) + ", and " + list_of_str[-1] 13 | 14 | 15 | def preprocess(): 16 | df = load_dataset("kjappelbaum/pervoskite_db", delimiter="|")["train"].to_pandas() 17 | df.dropna( 18 | subset=[ 19 | "device_stack", 20 | "pce", 21 | "ff", 22 | "jsc", 23 | "voc", 24 | "reduced_formulas", 25 | "descriptive_formulas", 26 | "iupac_formulas", 27 | "bandgap", 28 | ], 29 | inplace=True, 30 | ) 31 | device_stack_strings = [] 32 | 33 | df["pce"] = df["pce"].round(2) 34 | df["ff"] = df["ff"].round(2) 35 | df["jsc"] = df["jsc"].round(2) 36 | df["voc"] = df["voc"].round(2) 37 | df["bandgap"] = df["bandgap"].round(2) 38 | 39 | for _i, row in df.iterrows(): 40 | device_stack = ast.literal_eval(row["device_stack"]) 41 | device_stack_string = oxford_comma_join(device_stack) 42 | absorber = row["descriptive_formulas"] 43 | device_stack_string = device_stack_string.replace("Perovskite", absorber) 44 | device_stack_strings.append(device_stack_string) 45 | 46 | df["device_stack_string"] = device_stack_strings 47 | df[ 48 | [ 49 | "pce", 50 | "ff", 51 | "jsc", 52 | "voc", 53 | "bandgap", 54 | "reduced_formulas", 55 | "descriptive_formulas", 56 | "iupac_formulas", 57 | "device_stack_string", 58 | ] 59 | ].to_csv("data_clean.csv", index=False) 60 | print(len(df)) 61 | 62 | 63 | if __name__ == "__main__": 64 | preprocess() 65 | -------------------------------------------------------------------------------- /data/tabular/physics_stackexchange/meta.yaml: -------------------------------------------------------------------------------- 1 | name: physics_stackexchange 2 | description: |- 3 | Questions and answers mined from physics.stackexchange.com. 
4 | targets: 5 | - id: a 6 | description: answer to the question 7 | type: string 8 | - id: title 9 | description: title of the question 10 | type: string 11 | identifiers: 12 | - id: q 13 | type: string 14 | description: question asked on physics.stackexchange.com 15 | license: CC BY-SA 16 | links: 17 | - url: physics.stackexchange.com 18 | description: original data source 19 | - url: https://stackoverflow.com/help/licensing 20 | description: information about the license 21 | num_points: 6732 22 | templates: 23 | - |- 24 | {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} 25 | {#User: |Question: |Inquiry: |\n!}{#q} 26 | {#Assistant: |Answer: !}{#a} 27 | - |- 28 | {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} 29 | {#Question: |Inquiry: |\n!}{#q} 30 | {#Assistant: |Title: |Answer: |!}{#title} 31 | -------------------------------------------------------------------------------- /data/tabular/qm8/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_json( 6 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-qm8/resolve/main/qm8.json" 7 | ) 8 | 9 | df = df.replace("RDKit", "ChemNLP", regex=True) 10 | df.dropna(inplace=True) 11 | df = df.rename(columns={"smiles": "SMILES"}) 12 | df = df.query("is_longer_than_allowed==False") 13 | columns = [ 14 | "E1-CC2", 15 | "E2-CC2", 16 | "f1-CC2", 17 | "f2-CC2", 18 | "E1-PBE0", 19 | "E2-PBE0", 20 | "f1-PBE0", 21 | "f2-PBE0", 22 | "E1-CAM", 23 | "E2-CAM", 24 | "f1-CAM", 25 | "f2-CAM", 26 | ] 27 | # filter out rows in which one of the columns is not a float. Filter explicitly for the row in which 28 | # the values for all those columns are floats. 
29 | df = df[df[columns].apply(lambda x: x.apply(lambda y: isinstance(y, float))).all(1)] 30 | df[columns] = df[columns].astype(float) 31 | print(len(df)) 32 | df.to_csv("data_clean.csv", index=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | process() 37 | -------------------------------------------------------------------------------- /data/tabular/qm9/transform.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import pandas as pd 3 | 4 | DATASET_NAME = "qm9" 5 | 6 | 7 | def prepare_data(): 8 | dataset_name = "n0w0f/qm9-csv" 9 | split_name = "train" # data without any split @ hf 10 | filename_to_save = "data_clean.csv" 11 | 12 | # Load the dataset from Hugging Face 13 | dataset = datasets.load_dataset(dataset_name, split=split_name) 14 | 15 | df = pd.DataFrame(dataset) 16 | 17 | # assert column names 18 | fields_orig = df.columns.tolist() 19 | assert fields_orig == [ 20 | "inchi", 21 | "smiles", 22 | "rotational_constant_a", 23 | "rotational_constant_b", 24 | "rotational_constant_c", 25 | "dipole_moment", 26 | "polarizability", 27 | "homo", 28 | "lumo", 29 | "gap", 30 | "r2", 31 | "zero_point_energy", 32 | "u0", 33 | "u298", 34 | "h298", 35 | "g298", 36 | "heat_capacity", 37 | ] 38 | assert not df.duplicated().sum() 39 | 40 | # remove duplicates if any 41 | df = df.drop_duplicates() 42 | 43 | datapoints = len(df) 44 | # some parts of the code assume that "SMILES" is in upper case, rename this column 45 | df.rename(columns={"smiles": "SMILES"}, inplace=True) 46 | df = df.replace("RDKit", "ChemNLP", regex=True) 47 | df.to_csv(filename_to_save, index=False) 48 | return datapoints 49 | 50 | 51 | if __name__ == "__main__": 52 | print(f" Preparing clean tabular {DATASET_NAME} datatset") 53 | datapoints = prepare_data() 54 | print( 55 | f" Finished Preparing clean tabular {DATASET_NAME} datatset with {datapoints} datapoints" 56 | ) 57 | -------------------------------------------------------------------------------- /data/tabular/qmof_quantum/transform.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | import pandas as pd 4 | 5 | 6 | def process(): 7 | df = pd.read_json( 8 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-qmof-data/resolve/main/qmof_data.json" 9 | ) 10 | 11 | df.dropna( 12 | subset=[ 13 | "outputs.pbe.bandgap", 14 | "outputs.pbe.cbm", 15 | "outputs.pbe.vbm", 16 | "outputs.hle17.bandgap", 17 | "outputs.hle17.cbm", 18 | "outputs.hle17.vbm", 19 | "outputs.hse06.bandgap", 20 | "outputs.hse06.cbm", 21 | "outputs.hse06.vbm", 22 | "info.pld", 23 | "info.lcd", 24 | "info.density", 25 | "info.mofid.mofid", 26 | "info.mofid.smiles_nodes", 27 | "info.mofid.smiles_linkers", 28 | "info.mofid.topology", 29 | "info.symmetry.spacegroup_number", 30 | ], 31 | inplace=True, 32 | ) 33 | 34 | df["info.mofid.smiles_nodes"] = df["info.mofid.smiles_nodes"].apply( 35 | lambda x: ", ".join(ast.literal_eval(x)) 36 | ) 37 | 38 | df["info.mofid.smiles_linkers"] = df["info.mofid.smiles_linkers"].apply( 39 | lambda x: ", ".join(ast.literal_eval(x)) 40 | ) 41 | 42 | print(len(df)) 43 | 44 | df.to_csv("data_clean.csv", index=False) 45 | 46 | 47 | if __name__ == "__main__": 48 | process() 49 | -------------------------------------------------------------------------------- /data/tabular/rdkit_features/transform.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import fire 4 | from datasets import load_dataset 5 | 6 | 7 | def 
clean_df(df): 8 | df.dropna(inplace=True) 9 | df[ 10 | [ 11 | "NumHDonors", 12 | "NumHAcceptors", 13 | "NumHeteroatoms", 14 | "RingCount", 15 | "NumRotatableBonds", 16 | "NumAromaticBonds", 17 | "NumAcidGroups", 18 | "NumBasicGroups", 19 | ] 20 | ] = df[ 21 | [ 22 | "NumHDonors", 23 | "NumHAcceptors", 24 | "NumHeteroatoms", 25 | "RingCount", 26 | "NumRotatableBonds", 27 | "NumAromaticBonds", 28 | "NumAcidGroups", 29 | "NumBasicGroups", 30 | ] 31 | ].astype( 32 | int 33 | ) 34 | df["MolLogP"] = df["MolLogP"].astype(float) 35 | df["Apol"] = df["Apol"].astype(float) 36 | df.rename(columns={"text": "SMILES"}, inplace=True) 37 | return df 38 | 39 | 40 | def process(): 41 | if not (os.path.isfile("data_clean.csv")): 42 | df = load_dataset( 43 | "maykcaldas/smiles-transformers", split="validation" 44 | ).to_pandas() 45 | df = clean_df(df) 46 | df["split"] = "valid" 47 | df.to_csv("data_clean.csv", index=False) 48 | del df 49 | 50 | df = load_dataset("maykcaldas/smiles-transformers", split="test").to_pandas() 51 | df = clean_df(df) 52 | df["split"] = "test" 53 | df.to_csv("data_clean.csv", index=False, mode="a", header=False) 54 | del df 55 | 56 | splits = [f"train[{k}%:{k+5}%]" for k in range(0, 100, 5)] 57 | for s in splits: 58 | df = load_dataset("maykcaldas/smiles-transformers", split=s).to_pandas() 59 | df = clean_df(df) 60 | df["split"] = "train" 61 | df.to_csv("data_clean.csv", index=False, mode="a", header=False) 62 | else: 63 | print("Reusing present data_clean.csv.") 64 | 65 | 66 | if __name__ == "__main__": 67 | fire.Fire(process) 68 | -------------------------------------------------------------------------------- /data/tabular/rhea_db_masked/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_json( 6 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-rhea-db/resolve/main/rhea-reaction-smiles_prompts.json" 7 | ) 8 | df.dropna(subset=["masked_rxn_smiles", "missing_component"], inplace=True) 9 | print(len(df)) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | process() 15 | -------------------------------------------------------------------------------- /data/tabular/rhea_db_predictions/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def oxford_comma_join(elements): 5 | if len(elements) == 1: 6 | return elements[0] 7 | elif len(elements) == 2: 8 | return " and ".join(elements) 9 | else: 10 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 11 | 12 | 13 | def process(): 14 | df = pd.read_json( 15 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-rhea-db/resolve/main/rhea-reaction-smiles_prompts.json" 16 | ) 17 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 18 | df["product_string"] = df["products"].apply(oxford_comma_join) 19 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 20 | df.dropna(subset=["educt_string", "product_string"], inplace=True) 21 | print(len(df)) 22 | df[["RXNSMILES", "educt_string", "product_string"]].to_csv( 23 | "data_clean.csv", index=False 24 | ) 25 | 26 | 27 | if __name__ == "__main__": 28 | process() 29 | -------------------------------------------------------------------------------- /data/tabular/run_all_transform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval "$(conda shell.bash hook)" 4 | conda activate 
chemnlp 5 | 6 | for dir in */ 7 | do ( 8 | echo "$dir" 9 | cd "$dir" 10 | python transform.py 11 | ) 12 | done 13 | -------------------------------------------------------------------------------- /data/tabular/sigma_aldrich_safety_data/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/msds_sigma_aldrich", 8 | filename="msds.csv", 9 | repo_type="dataset", 10 | ) 11 | 12 | df = pd.read_csv(file) 13 | df = df.drop(columns=["h_statements"]) 14 | df = df.dropna() 15 | df.to_csv("data_clean.csv") 16 | 17 | 18 | if __name__ == "__main__": 19 | transform_data() 20 | -------------------------------------------------------------------------------- /data/tabular/smiles_to_3d/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | from chemnlp.data.convert import is_longer_than_allowed 4 | 5 | 6 | def process(): 7 | dataset = load_dataset("kjappelbaum/chemnlp-qm9-file-translation") 8 | df = dataset["train"].to_pandas() 9 | df.replace(to_replace="RDKit", value="ChemNLP", inplace=True) 10 | df["is_longer_than_allowed"] = df["mol2000"].apply(is_longer_than_allowed) 11 | df = df[~df["is_longer_than_allowed"]] 12 | print(len(df)) 13 | df = df.replace("RDKit", "ChemNLP", regex=True) 14 | df.to_csv("data_clean.csv", index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | process() 19 | -------------------------------------------------------------------------------- /data/tabular/thermosol/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_csv( 6 | "http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/thermosol.csv" 7 | ) 8 | df.rename(columns={"smile": "SMILES"}, inplace=True) 9 | df.dropna(inplace=True) 10 | print(len(df)) 11 | df[["SMILES", "target"]].to_csv("data_clean.csv", index=False) 12 | 13 | 14 | if __name__ == "__main__": 15 | process() 16 | -------------------------------------------------------------------------------- /data/tabular/uniprot_binding_single/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_binding_sites" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.end_binding_site = uniprot.end_binding_site.astype(int) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | uniprot = uniprot[uniprot.end_binding_site == uniprot.start_binding_site] 19 | print(f"Successfully loaded {DATA}! 
{len(uniprot)} rows") 20 | uniprot.to_csv("data_clean.csv", index=False) 21 | print(f"Successfully loaded {DATA}!") 22 | return uniprot 23 | 24 | 25 | if __name__ == "__main__": 26 | load_dataset() 27 | -------------------------------------------------------------------------------- /data/tabular/uniprot_binding_sites_multiple/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_binding_sites" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.end_binding_site = uniprot.end_binding_site.astype(int) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | uniprot = uniprot[uniprot.end_binding_site > uniprot.start_binding_site] 19 | print(f"Successfully loaded {DATA}! {len(uniprot)} rows") 20 | uniprot.to_csv("data_clean.csv", index=False) 21 | print(f"Successfully loaded {DATA}!") 22 | return uniprot 23 | 24 | 25 | if __name__ == "__main__": 26 | load_dataset() 27 | -------------------------------------------------------------------------------- /data/tabular/uniprot_organisms/meta.yaml: -------------------------------------------------------------------------------- 1 | name: uniprot_organisms 2 | description: |- 3 | Organisms in which a amino-acid sequence can be found. 4 | targets: 5 | - id: organisms 6 | description: organisms in which a protein can be found 7 | type: text 8 | names: 9 | - noun: organisms 10 | identifiers: 11 | - id: other 12 | type: AS_SEQUENCE 13 | description: other 14 | license: MIT 15 | links: 16 | - url: https://www.uniprot.org/ 17 | description: data source 18 | num_points: 559428 19 | bibtex: 20 | - |- 21 | @article{10.1093/nar/gkac1052, 22 | author = {The UniProt Consortium}, 23 | title = {UniProt - the Universal Protein Knowledgebase in 2023}, 24 | journal = {Nucleic Acids Research}, 25 | volume = {51}, 26 | number = {D1}, 27 | pages = {D523-D531}, 28 | year = {2022}, 29 | month = {11}, 30 | issn = {0305-1048}, 31 | doi = {10.1093/nar/gkac1052}, 32 | url = {https://doi.org/10.1093/nar/gkac1052}} 33 | templates: 34 | - |- 35 | The protein with the {#amino acid sequence|AA sequence!} {other#} can be found in {#the organism |!}{organisms#}. 36 | - |- 37 | Task: {#Predict|Identify!} the organism in which {#the below|this!} {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. 38 | {#Amino acid sequence |Sequence|AA sequence!}: {other#} 39 | Result: {organisms#} 40 | - |- 41 | User: In what organism can you find the following {#protein|amino acid sequence|AA sequence|polypeptide!}:\n{other#} 42 | Assistant: The given {#protein|amino acid sequence|AA sequence|polypeptide!} can be found in {organisms#}. 43 | - |- 44 | Task: {#Predict|Identify!} the organism in which {#the below|this!} {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. 
45 | {#Amino acid sequence|Sequence|AA sequence!}: {other#} 46 | Result:{organisms#} 47 | -------------------------------------------------------------------------------- /data/tabular/uniprot_organisms/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_organisms" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.rename(columns={"sequence": "other"}, inplace=True) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | print(f"Successfully loaded {DATA}! {len(uniprot)} rows") 19 | uniprot.to_csv("data_clean.csv", index=False) 20 | print(f"Successfully loaded {DATA}!") 21 | return uniprot 22 | 23 | 24 | if __name__ == "__main__": 25 | load_dataset() 26 | -------------------------------------------------------------------------------- /data/tabular/uniprot_reactions/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_reactions" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.rename(columns={"sequence": "other"}, inplace=True) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | print(f"Successfully loaded {DATA}! {len(uniprot)} rows") 19 | uniprot.to_csv("data_clean.csv", index=False) 20 | print(f"Successfully loaded {DATA}!") 21 | return uniprot 22 | 23 | 24 | if __name__ == "__main__": 25 | load_dataset() 26 | -------------------------------------------------------------------------------- /data/tabular/uniprot_sentences/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import regex as re 3 | from huggingface_hub import hf_hub_download 4 | 5 | DATA = "uniprot_sentences" 6 | 7 | 8 | def clean_up_sentences(text: str) -> str: 9 | "Remove (By similarity) from the sentences" 10 | 11 | updated_text = re.sub(r"\s*\((?:By\.? similarity)\)\s*", "", text) 12 | updated_text = updated_text.replace(" . ", ". ") 13 | updated_text = updated_text.replace(" .", ".") 14 | updated_text = updated_text.strip() 15 | if not (updated_text.endswith(".")): 16 | updated_text += "." 17 | return updated_text 18 | 19 | 20 | def load_dataset() -> pd.DataFrame: 21 | uniprot = hf_hub_download( 22 | repo_id="chemnlp/uniprot", 23 | filename=f"{DATA}/data_clean.csv", 24 | repo_type="dataset", 25 | ) 26 | 27 | uniprot = pd.read_csv(uniprot) 28 | uniprot.sentences = uniprot.sentences.apply(clean_up_sentences) 29 | uniprot.drop_duplicates( 30 | inplace=True, 31 | ) 32 | print(f"Successfully loaded {DATA}! 
{len(uniprot)} rows") 33 | uniprot.to_csv("data_clean.csv", index=False) 34 | print(f"Successfully loaded {DATA}!") 35 | return uniprot 36 | 37 | 38 | if __name__ == "__main__": 39 | load_dataset() 40 | -------------------------------------------------------------------------------- /data/tabular/uspto/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def oxford_comma_join(elements): 6 | try: 7 | if len(elements) == 1: 8 | return elements[0] 9 | elif len(elements) == 2: 10 | return " and ".join(elements) 11 | else: 12 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 13 | except Exception: 14 | return None 15 | 16 | 17 | def process(): 18 | file_train = hf_hub_download( 19 | repo_id="kjappelbaum/chemnlp-uspto", 20 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 21 | repo_type="dataset", 22 | ) 23 | df_train = pd.read_json(file_train) 24 | df_train["split"] = "train" 25 | 26 | file_test = hf_hub_download( 27 | repo_id="kjappelbaum/chemnlp-uspto", 28 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 29 | repo_type="dataset", 30 | ) 31 | df_test = pd.read_json(file_test) 32 | df_test["split"] = "test" 33 | 34 | file_valid = hf_hub_download( 35 | repo_id="kjappelbaum/chemnlp-uspto", 36 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 37 | repo_type="dataset", 38 | ) 39 | 40 | df_valid = pd.read_json(file_valid) 41 | df_valid["split"] = "valid" 42 | 43 | df = pd.concat([df_train, df_test, df_valid]) 44 | 45 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 46 | df["product_string"] = df["products"].apply(oxford_comma_join) 47 | df["RXNSMILES"] = df["canonical_rxn_smiles"] 48 | 49 | print(len(df)) 50 | df.to_csv("data_clean.csv", index=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | process() 55 | -------------------------------------------------------------------------------- /data/tabular/uspto_yield/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def oxford_comma_join(elements): 6 | try: 7 | if len(elements) == 1: 8 | return elements[0] 9 | elif len(elements) == 2: 10 | return " and ".join(elements) 11 | else: 12 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 13 | except Exception: 14 | return None 15 | 16 | 17 | def process(): 18 | file_train = hf_hub_download( 19 | repo_id="kjappelbaum/chemnlp-uspto", 20 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 21 | repo_type="dataset", 22 | ) 23 | df_train = pd.read_json(file_train) 24 | df_train["split"] = "train" 25 | 26 | file_test = hf_hub_download( 27 | repo_id="kjappelbaum/chemnlp-uspto", 28 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 29 | repo_type="dataset", 30 | ) 31 | df_test = pd.read_json(file_test) 32 | df_test["split"] = "test" 33 | 34 | file_valid = hf_hub_download( 35 | repo_id="kjappelbaum/chemnlp-uspto", 36 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 37 | repo_type="dataset", 38 | ) 39 | 40 | df_valid = pd.read_json(file_valid) 41 | df_valid["split"] = "valid" 42 | 43 | df = pd.concat([df_train, df_test, df_valid]) 44 | df = df.query("WithinTolerance == True") 45 | df["yield"] = df["MeanYield"] 46 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 47 | 
df["product_string"] = df["products"].apply(oxford_comma_join) 48 | df["RXNSMILES"] = df["canonical_rxn_smiles"] 49 | print(len(df)) 50 | df.to_csv("data_clean.csv", index=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | process() 55 | -------------------------------------------------------------------------------- /data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al/meta.yaml: -------------------------------------------------------------------------------- 1 | name: volume_of_distribution_at_steady_state_lombardo_et_al 2 | description: |- 3 | The volume of distribution at steady state (VDss) measures the degree 4 | of a drug's concentration in the body tissue compared to concentration in the blood. 5 | Higher VD indicates a higher distribution in the tissue and usually indicates 6 | the drug with high lipid solubility, low plasma protein binding rate. 7 | targets: 8 | - id: VDss_Lombardo 9 | description: volume of distribution at steady state (VDss) 10 | units: L/kg 11 | type: continuous 12 | names: 13 | - noun: volume of distribution at steady state (VDss) 14 | - noun: VDss 15 | uris: 16 | - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C85538 17 | benchmarks: 18 | - name: TDC 19 | link: https://tdcommons.ai/ 20 | split_column: split 21 | identifiers: 22 | - id: SMILES 23 | type: SMILES 24 | description: SMILES 25 | - id: compound_name 26 | type: Other 27 | names: 28 | - noun: compound name 29 | - noun: drug name 30 | - noun: generic drug name 31 | description: mix of drug name and ids 32 | license: CC BY 4.0 33 | links: 34 | - url: https://doi.org/10.1021/acs.jcim.6b00044 35 | description: corresponding publication 36 | - url: https://tdcommons.ai/single_pred_tasks/adme/#vdss-volumn-of-distribution-at-steady-state-lombardo-et-al 37 | description: data source 38 | num_points: 1130 39 | bibtex: 40 | - |- 41 | @article{Lombardo2016, 42 | doi = {10.1021/acs.jcim.6b00044}, 43 | url = {https://doi.org/10.1021/acs.jcim.6b00044}, 44 | year = {2016}, 45 | month = sep, 46 | publisher = {merican Chemical Society (ACS)}, 47 | volume = {56}, 48 | number = {10}, 49 | pages = {2042--2052}, 50 | author = {Franco Lombardo and Yankang Jing}, 51 | title = {In Silico Prediction of Volume of Distribution in Humans. Extensive Data Set and the 52 | Exploration of Linear and Nonlinear Methods Coupled with Molecular Interaction Fields Descriptors}, 53 | journal = {Journal of Chemical Information and Modeling} 54 | -------------------------------------------------------------------------------- /data/tabular/zinc/meta.yaml: -------------------------------------------------------------------------------- 1 | name: zinc 2 | description: |- 3 | ZINC is a free database of commercially-available compounds for virtual screening. 4 | It contains over 230 million purchasable compounds in ready-to-dock, 3D formats. 5 | TDC uses a 250,000 sampled version from the original Mol-VAE paper. 6 | identifiers: 7 | - id: SMILES 8 | type: SMILES 9 | description: SMILES 10 | license: |- 11 | ZINC is free to use for everyone. 12 | Redistribution of significant subsets requires written permission from the authors. 
13 | links: 14 | - url: https://pubs.acs.org/doi/full/10.1021/acs.jcim.5b00559 15 | description: Article about original dataset 16 | - url: https://pubs.acs.org/doi/abs/10.1021/acscentsci.7b00572 17 | description: Exemplary related article shown in tdc's website 18 | num_points: 249455 19 | bibtex: 20 | - |- 21 | @article{doi:10.1021/acs.jcim.5b00559, 22 | author = {Sterling, Teague and Irwin, John J.}, 23 | title = {ZINC 15 - Ligand Discovery for Everyone}, 24 | journal = {Journal of Chemical Information and Modeling}, 25 | volume = {55}, 26 | number = {11}, 27 | pages = {2324-2337}, 28 | year = {2015}, 29 | doi = {10.1021/acs.jcim.5b00559}, 30 | note ={PMID: 26479676}, 31 | URL = {https://doi.org/10.1021/acs.jcim.5b00559}, 32 | eprint = {https://doi.org/10.1021/acs.jcim.5b00559}, 33 | } 34 | -------------------------------------------------------------------------------- /data/text_sampling/get_dataset_overlap.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | import pandas as pd 4 | 5 | skip_ds = [ 6 | "rdkit_features", 7 | "iupac_smiles", 8 | "orbnet_denali", 9 | "qmof_gcmc", 10 | "qmof_quantum", 11 | "zinc", 12 | ] 13 | 14 | if __name__ == "__main__": 15 | path_base = __file__.replace("text_sampling/get_dataset_overlap.py", "") 16 | fns = sorted(glob.glob(path_base + "tabular/**/data_clean.csv")) 17 | for i in range(len(fns)): 18 | for j in range(i + 1, len(fns)): 19 | fn1 = fns[i] 20 | fn2 = fns[j] 21 | ds1 = fn1.split("/")[-2] 22 | ds2 = fn2.split("/")[-2] 23 | if (ds1 in skip_ds) or (ds2 in skip_ds): 24 | continue 25 | df1 = pd.read_csv( 26 | fn1, index_col=False, low_memory=False, nrows=0 27 | ) # only get columns 28 | df2 = pd.read_csv( 29 | fn2, index_col=False, low_memory=False, nrows=0 30 | ) # only get columns 31 | if ("SMILES" in df1.columns) and ("SMILES" in df2.columns): 32 | df1 = pd.read_csv( 33 | fn1, index_col=False, low_memory=False, usecols=["SMILES"] 34 | ) 35 | df2 = pd.read_csv( 36 | fn2, index_col=False, low_memory=False, usecols=["SMILES"] 37 | ) 38 | print( 39 | fn1.split("/")[-2], 40 | fn2.split("/")[-2], 41 | len(set(df1.SMILES) & set(df2.SMILES)), 42 | ) 43 | -------------------------------------------------------------------------------- /data/text_sampling/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def str_presenter(dumper, data: dict): 5 | """configures yaml for dumping multiline strings 6 | Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data 7 | """ 8 | if data.count("\n") > 0: # check for multiline string 9 | return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") 10 | return dumper.represent_scalar("tag:yaml.org,2002:str", data) 11 | 12 | 13 | def load_yaml(path: str) -> dict: 14 | """Load yaml file from path.""" 15 | with open(path, "r") as stream: 16 | try: 17 | data = yaml.safe_load(stream) 18 | except yaml.YAMLError as exc: 19 | raise Exception(exc) 20 | return data 21 | -------------------------------------------------------------------------------- /docs/api/meta_yaml_generator.md: -------------------------------------------------------------------------------- 1 | # Meta YAML Generator 2 | 3 | ## Overview 4 | 5 | The Meta YAML Generator is a tool designed to automatically create a `meta.yaml` file for chemical datasets using Large Language Models (LLMs). 
It analyzes the structure of a given DataFrame and generates a comprehensive metadata file, including advanced sampling methods and template formats. 6 | 7 | The model used by default is `gpt4o`. For using it, you need to expose the `OPENAI_API_KEY` environment variable. 8 | 9 | ## `generate_meta_yaml` 10 | 11 | ::: chemnlp.data.meta_yaml_generator.generate_meta_yaml 12 | handler: python 13 | options: 14 | show_root_heading: true 15 | show_source: false 16 | 17 | ## Usage Example 18 | 19 | ```python 20 | import pandas as pd 21 | from chemnlp.data.meta_yaml_generator import generate_meta_yaml 22 | 23 | # Load your dataset 24 | df = pd.read_csv("your_dataset.csv") 25 | 26 | # Generate meta.yaml 27 | meta_yaml = generate_meta_yaml( 28 | df, 29 | dataset_name="Polymer Properties Dataset", 30 | description="A dataset of polymer properties including glass transition temperatures and densities", 31 | output_path="path/to/save/meta.yaml" 32 | ) 33 | 34 | # The meta_yaml variable now contains the dictionary representation of the meta.yaml 35 | # If an output_path was provided, the meta.yaml file has been saved to that location 36 | ``` 37 | 38 | You can also use it as a command-line tool: 39 | 40 | ```bash 41 | python -m chemnlp.data.meta_yaml_generator path/to/your_dataset.csv --dataset_name "Polymer Properties Dataset" --description "A dataset of polymer properties including glass transition temperatures and densities" --output_path "path/to/save/meta.yaml" 42 | ``` 43 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # ChemNLP 2 | 3 | ChemNLP is an effort to create the largest dataset of chemical data. 4 | We then use this dataset to train large language models (LLMs). 
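For a concrete first taste of the pipeline, here is one invocation of the sampling CLI, taken verbatim from `experiments/ablations/20240814_sample_data.bash` (see the User Guide and the Sampler CLI reference for the full option set):

```bash
# Convert one tabular dataset into class-balanced text samples
chemnlp-sample data/tabular/MUV_846/ sampled --class_balanced
```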
5 | -------------------------------------------------------------------------------- /experiments/ablations/20240814_sample_data.bash: -------------------------------------------------------------------------------- 1 | # Without wrappers 2 | # Benchmarking 3 | chemnlp-sample data/tabular/lipophilicity sampled_benchmark/ --benchmarking --class_balanced 4 | chemnlp-sample data/tabular/bicerano_dataset sampled_benchmark/ --benchmarking --class_balanced 5 | chemnlp-sample data/tabular/opv sampled_benchmark/ --benchmarking --class_balanced 6 | chemnlp-sample data/tabular/melting_points sampled_benchmark/ --benchmarking --class_balanced 7 | chemnlp-sample data/tabular/bc5disease sampled_benchmark/ --benchmarking --class_balanced 8 | chemnlp-sample data/tabular/MUV_846 sampled_benchmark/ --benchmarking --class_balanced 9 | 10 | # Default 11 | chemnlp-sample data/tabular/lipophilicity/ sampled --class_balanced 12 | chemnlp-sample data/tabular/bicerano_dataset/ sampled --class_balanced 13 | chemnlp-sample data/tabular/opv/ sampled --class_balanced 14 | chemnlp-sample data/tabular/melting_points/ sampled --class_balanced 15 | chemnlp-sample data/tabular/bc5disease/ sampled --class_balanced 16 | chemnlp-sample data/tabular/MUV_846/ sampled --class_balanced 17 | 18 | 19 | # With wrappers 20 | # Benchmarking 21 | chemnlp-sample data/tabular/lipophilicity/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 22 | chemnlp-sample data/tabular/bicerano_dataset/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 23 | chemnlp-sample data/tabular/opv/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 24 | chemnlp-sample data/tabular/melting_points/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 25 | chemnlp-sample data/tabular/bc5disease/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 26 | chemnlp-sample data/tabular/MUV_846/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 27 | 28 | # Default 29 | chemnlp-sample data/tabular/lipophilicity/ sampled_wrapped --wrap-identifiers --class_balanced 30 | chemnlp-sample data/tabular/bicerano_dataset/ sampled_wrapped --wrap-identifiers --class_balanced 31 | chemnlp-sample data/tabular/opv/ sampled_wrapped --wrap-identifiers --class_balanced 32 | chemnlp-sample data/tabular/melting_points/ sampled_wrapped --wrap-identifiers --class_balanced 33 | chemnlp-sample data/tabular/bc5disease/ sampled_wrapped --wrap-identifiers --class_balanced 34 | chemnlp-sample data/tabular/MUV_846/ sampled_wrapped --wrap-identifiers --class_balanced 35 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/data_mixing.yml: -------------------------------------------------------------------------------- 1 | data_paths: ["/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/hendrycks_STEM", "/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv"] 2 | num_tokens: [2826240, 28262400] 3 | context_length: 2048 4 | save_path: "/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/2826240hendrycks_28262400chemrxiv" 5 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/hf_data.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | dataset_name: "EleutherAI/pile" 4 | dataset_args: {"name": "pubmed", "split": "train"} 5 
| batch_size: 1 6 | string_key: "text" 7 | save_path: "/fsx/proj-chemnlp/data/example_tokenised" 8 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/hf_data_wiki.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | dataset_name: "wikipedia" 4 | dataset_args: {"name": "20220301.en", "split": "train", "beam_runner": "DirectRunner"} 5 | batch_size: 1000 6 | out_dir: "/fsx/proj-chemnlp/data" 7 | string_key: "text" 8 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/prep_lm_eval_data.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | tasks: ["hendrycksTest-college_biology", "hendrycksTest-college_chemistry", "hendrycksTest-college_mathematics", "hendrycksTest-college_physics", "hendrycksTest-high_school_mathematics", "hendrycksTest-high_school_biology", "hendrycksTest-high_school_chemistry", "hendrycksTest-high_school_physics"] 4 | data_split: "validation" 5 | out_dir: "/fsx/proj-chemnlp/data" 6 | save_name: "hendrycks_STEM_2" 7 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/prep_smiles_data.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | data_split: "train" 4 | out_dir: "/fsx/proj-chemnlp/data" 5 | save_name: "coconut_smiles" 6 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_S1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 1, 10 | "overlap_comm": true, 11 | "contiguous_gradients": true, 12 | "sub_group_size": 1e9, 13 | "reduce_bucket_size": "auto" 14 | }, 15 | "gradient_accumulation_steps": "auto", 16 | "gradient_clipping": "auto", 17 | "train_batch_size": "auto", 18 | "train_micro_batch_size_per_gpu": "auto" 19 | } 20 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_S2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 2, 10 | "offload_optimizer": { 11 | "device": "none", 12 | "pin_memory": true 13 | }, 14 | "overlap_comm": true, 15 | "contiguous_gradients": true, 16 | "sub_group_size": 1e9, 17 | "reduce_bucket_size": "auto" 18 | }, 19 | "gradient_accumulation_steps": "auto", 20 | "gradient_clipping": "auto", 21 | "train_batch_size": "auto", 22 | "train_micro_batch_size_per_gpu": "auto" 23 | } 24 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_offload_S2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 2, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | }, 14 | "overlap_comm": true, 15 | "contiguous_gradients": true, 16 | "sub_group_size": 
1e9, 17 | "reduce_bucket_size": "auto" 18 | }, 19 | "gradient_accumulation_steps": "auto", 20 | "gradient_clipping": "auto", 21 | "train_batch_size": "auto", 22 | "train_micro_batch_size_per_gpu": "auto" 23 | } 24 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_offload_S3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 3, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | }, 14 | "offload_param": { 15 | "device": "cpu", 16 | "pin_memory": true 17 | }, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | }, 28 | "gradient_accumulation_steps": "auto", 29 | "gradient_clipping": "auto", 30 | "train_batch_size": "auto", 31 | "train_micro_batch_size_per_gpu": "auto" 32 | } 33 | -------------------------------------------------------------------------------- /experiments/configs/eval_configs/default_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_1b/checkpoint-final" 3 | # model_args: "pretrained=EleutherAI/pythia-1b" 4 | tasks: "hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_mathematics,hendrycksTest-college_physics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_physics" 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation 10 | wandb_run_name: 1B_fullfinetune_STEM 11 | wandb_entity: chemnlp 12 | -------------------------------------------------------------------------------- /experiments/configs/eval_configs/nlp_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/path/checkpoint" # update 3 | tasks: "lambada_standard" 4 | num_fewshot: 0 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation # update 10 | wandb_run_name: 1B_fullfinetune # update 11 | wandb_entity: chemnlp 12 | -------------------------------------------------------------------------------- /experiments/configs/eval_configs/safety_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/path/checkpoint" # update 3 | tasks: "crows_pairs_english_race_color,crows_pairs_english_socioeconomic,crows_pairs_english_gender,crows_pairs_english_age,crows_pairs_english_religion,crows_pairs_english_disability,crows_pairs_english_sexual_orientation,crows_pairs_english_nationality,crows_pairs_english_physical_appearance" 4 | num_fewshot: 0 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation # update 10 | wandb_run_name: 1B_fullfinetune # update 11 | wandb_entity: chemnlp 12 | 
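The evaluation configs above are plain YAML consumed by `lm-evaluation-harness/main_eval.py` (see `experiments/scripts/run_eval.sh`). The sketch below is illustrative only: the helper function and the hard-coded paths are assumptions for demonstration, not part of the repository. It shows how such a config can be loaded and patched with PyYAML before launching a run:

```python
import yaml


def load_and_override(config_path: str, overrides: dict) -> dict:
    """Load an eval YAML config and apply top-level overrides (illustrative helper)."""
    with open(config_path) as fh:
        cfg = yaml.safe_load(fh)
    cfg.update(overrides)  # e.g. point model_args at a freshly trained checkpoint
    return cfg


cfg = load_and_override(
    "experiments/configs/eval_configs/safety_eval_config.yaml",
    {
        "model_args": "pretrained=/fsx/path/checkpoint",
        "wandb_run_name": "1B_fullfinetune",
    },
)
print(cfg["tasks"])
```

This mirrors what `experiments/scripts/eval_create_batch_configs.py` does when it rewrites `model_args` and `wandb_run_name` for every checkpoint directory.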
-------------------------------------------------------------------------------- /experiments/configs/eval_configs/stem_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/path/checkpoint" # update 3 | tasks: "pile_pubmed-abstracts,pile_pubmed-central,headqa_en,sciq,pubmedqa,is_smiles,complete_smiles,periodic_table,openbookqa,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_mathematics,hendrycksTest-college_physics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_physics" 4 | num_fewshot: 0 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation # update 10 | wandb_run_name: 1B_fullfinetune # update 11 | wandb_entity: chemnlp 12 | -------------------------------------------------------------------------------- /experiments/configs/gpt-neox/cluster_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | # see example configs for sampling options 4 | "data-path": "/fsx/proj-chemnlp/data/marianna13/chemrxiv/data_text_document", "save": "/fsx/proj-chemnlp/experiments/checkpoints/finetuned/pythia-160M", "load": "/fsx/proj-chemnlp/experiments/checkpoints/pretrained/pythia-160M", "finetune": True, "checkpoint_validation_with_forward_pass": False, "log-dir": "/fsx/proj-chemnlp/experiments/logs", "log_interval": 100, "log_grad_pct_zeros": False, "log_param_norm": False, "log_grad_norm": False, "use_wandb": True, "wandb_host": "https://stability.wandb.io", "wandb_project": "LLCheM", "wandb_group": "Test Runs", "hostfile": "/mock_path", "num_gpus": 1} 5 | -------------------------------------------------------------------------------- /experiments/configs/gpt-neox/soft_prompt.yml: -------------------------------------------------------------------------------- 1 | { 2 | # peft method settings 3 | "soft_prompt_tuning": {"enabled": True, # also freezes all other parameters 4 | "n_tokens": 10, "init_string": "", "init_range": 0.5}} 5 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/160M_full.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-160m/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-160m 8 | revision: main # latest model 9 | #checkpoint_path: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_160M/checkpoint-1600 10 | # Training strategies (PromptTuningConfig arguments) 11 | prompt_tuning: 12 | enabled: false 13 | # Training configuration (TrainerArguments from HF) 14 | trainer: 15 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_160M 16 | num_train_epochs: 1 17 | learning_rate: 3e-4 18 | evaluation_strategy: steps 19 | logging_steps: 50 20 | eval_steps: 500 21 | save_steps: 1000 22 | dataloader_num_workers: 4 23 | bf16: true 24 | fp16: false 25 | per_device_train_batch_size: 4 26 | per_device_eval_batch_size: 4 27 | # Logging configuration (WandB init arguments) 28 | wandb: 29 | enabled: true 30 | project: LLCheM 31 | group: test 32 | name: test_160M_full_v2 # 
full_160M_v1 33 | entity: chemnlp 34 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/160M_ptune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-160m/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-160m 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: true 12 | num_virtual_tokens: 10 13 | prompt_tuning_init_text: " " 14 | # Training configuration (TrainerArguments from HF) 15 | trainer: 16 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/160M 17 | num_train_epochs: 1 18 | learning_rate: 3e-4 19 | evaluation_strategy: steps 20 | logging_steps: 5 21 | eval_steps: 50 22 | save_steps: 200 23 | dataloader_num_workers: 4 24 | bf16: true 25 | fp16: false 26 | per_device_train_batch_size: 30 27 | per_device_eval_batch_size: 30 28 | # Logging configuration (WandB init arguments) 29 | wandb: 30 | enabled: true 31 | project: LLCheM 32 | group: test 33 | name: peft_160M_v1 34 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/1B_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-1b 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_1b 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 50 19 | eval_steps: 500 20 | save_steps: 1000 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 2 25 | per_device_eval_batch_size: 8 26 | # Logging configuration (WandB init arguments) 27 | wandb: 28 | enabled: true 29 | project: LLCheM 30 | group: test 31 | name: test_1b_fine_tune 32 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/3B_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-2.8b 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_3b 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 10 19 | eval_steps: 50 20 | save_steps: 500 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 8 25 | 
per_device_eval_batch_size: 1 26 | gradient_checkpointing: True 27 | deepspeed_config: deepspeed_S2.json 28 | # Logging configuration (WandB init arguments) 29 | wandb: 30 | enabled: true 31 | project: LLCheM 32 | group: 3B_deepspeed 33 | name: 3B_fine_tune 34 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/410M_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-410m/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-410M 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_410M 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 50 19 | eval_steps: 500 20 | save_steps: 1000 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 2 25 | per_device_eval_batch_size: 2 26 | # Logging configuration (WandB init arguments) 27 | wandb: 28 | enabled: true 29 | project: LLCheM 30 | group: test 31 | name: test_410M_fine_tune_2 32 | entity: chemnlp 33 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/7B_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-6.9b 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_7b_test 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 10 19 | eval_steps: 50 20 | save_steps: 500 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 10 25 | per_device_eval_batch_size: 1 26 | gradient_checkpointing: True 27 | deepspeed_config: deepspeed_offload_S3.json 28 | # Logging configuration (WandB init arguments) 29 | wandb: 30 | enabled: true 31 | project: LLCheM 32 | group: 7B_deepspeed 33 | name: 7B_fine_tune_test 34 | -------------------------------------------------------------------------------- /experiments/data/merge_epmc_to_jsonl.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is specifically for taking a nested folder of jsonlines files 3 | and merging them into one complete jsonlines file. 4 | 5 | e.g. 6 | /2022_05_27/file1.jsonl 7 | /2022_05_25/file2.jsonl 8 | ... 
9 | """ 10 | 11 | import multiprocessing 12 | import os 13 | from typing import List 14 | 15 | import jsonlines 16 | from tqdm import tqdm 17 | 18 | # NOTE hardcoded paths 19 | ROOT = "/fsx/proj-chemnlp/jeb/europmc_deduped" 20 | OUT_DIR = "/fsx/proj-chemnlp/jeb/europmc_deduped/merged_paper_and_abstracts" 21 | 22 | 23 | def write_to_merged_file(all_files: List[str], name: str): 24 | """Loops over all_files, reads them and writes them to the out_file""" 25 | out_file = f"{OUT_DIR}/merged_all_{name}.jsonl" 26 | print(f"Writing {len(all_files)} {name} files to {out_file}") 27 | # start context manager for writing 28 | with jsonlines.open(out_file, mode="w") as writer: 29 | for file in tqdm(all_files): 30 | # as writing is serial, this cannot be easily parallelised 31 | with jsonlines.open(file) as reader: 32 | all_entries = [*reader] 33 | writer.write_all(all_entries) 34 | 35 | 36 | if __name__ == "__main__": 37 | # collect all files 38 | result = os.popen(f"find {ROOT} -type f -name '*.jsonl'") 39 | parsed_result = result.read().split("\n") 40 | all_files = [x for x in parsed_result if x] 41 | paper_files = [x for x in all_files if "ft" in x] 42 | abstract_files = [x for x in all_files if "abs" in x] 43 | print( 44 | f"{len(all_files)} files, {len(paper_files)} paper and {len(abstract_files)} abstract files." 45 | ) 46 | 47 | # merge and process 48 | with multiprocessing.Pool(os.cpu_count()) as p: 49 | p.starmap( 50 | write_to_merged_file, 51 | [(paper_files, "papers"), (abstract_files, "abstracts")], 52 | ) 53 | -------------------------------------------------------------------------------- /experiments/data/prepare_gptneox_chemrxiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preparing chemrxiv dataset as per GPT-NeoX guidelines 3 | NOTE this needs to be run from the root of this repository directory 4 | 5 | Example usage: 6 | python experiments/chem_data_prep.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/ 7 | """ 8 | 9 | import argparse 10 | import os 11 | 12 | import datasets 13 | import jsonlines 14 | 15 | DATASET = "marianna13/chemrxiv" 16 | GPT_NEOX_KEY = "text" 17 | 18 | if __name__ == "__main__": 19 | # parse args 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "save_dir", help="Where you want to store the prepared dataset." 23 | ) 24 | parser.add_argument( 25 | "gptneox_dir", help="Where you can find the GPT-NeoX repository." 26 | ) 27 | args = parser.parse_args() 28 | 29 | # save initial strings from chemrxiv articles as jsonlines 30 | chem_data = datasets.load_dataset(DATASET) 31 | all_full_text_samples = [ 32 | {GPT_NEOX_KEY: paper["TEXT"]} for paper in chem_data["train"] 33 | ] 34 | save_path = f"{args.save_dir}/{DATASET}" 35 | data_path = f"{save_path}/data.jsonl" 36 | os.makedirs(save_path, exist_ok=True) 37 | with jsonlines.open(data_path, "w") as writer: 38 | writer.write_all(all_full_text_samples) 39 | 40 | # execute gpt-neox processing 41 | gpt_tool_path = f"{args.gptneox_dir}/tools/preprocess_data.py" 42 | os.system( 43 | f""" 44 | python {gpt_tool_path} 45 | --input {data_path} 46 | --output-prefix {save_path}/data 47 | --vocab /fsx/pile/20B_tokenizer.json 48 | --dataset-impl mmap 49 | --tokenizer-type HFTokenizer --append-eod 50 | """ 51 | ) 52 | -------------------------------------------------------------------------------- /experiments/data/sbatch_hf_dataset.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="llched-tokenise" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/tokenise_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/tokenise_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs tokenisation of Hugging Face datasets 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the full path to the tokenisation configuration file 17 | 18 | set -ex # allow for exiting based on non-0 codes 19 | 20 | # set workdir 21 | cd /fsx/proj-chemnlp/$2/chemnlp 22 | 23 | # create environment 24 | source experiments/scripts/env_creation_hf.sh $1 $2 25 | pip install ".[tokenisation]" 26 | 27 | # trigger run 28 | python experiments/data/prepare_hf_dataset.py $3 29 | -------------------------------------------------------------------------------- /experiments/data/sbatch_hf_split.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llched-split" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/split_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/split_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs tokenisation of Hugging Face datasets 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the path to the raw jsonlines file 17 | ### The fourth arg ($4) is the fractional size of the test / validation sets 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | 21 | # set workdir 22 | cd /fsx/proj-chemnlp/$2/chemnlp 23 | 24 | # create environment 25 | source experiments/scripts/env_creation_hf.sh $1 $2 26 | 27 | # trigger run 28 | python experiments/data/split_data.py $3 $4 29 | -------------------------------------------------------------------------------- /experiments/data/sbatch_merge_epmc_jsonl.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="merge-epmc" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/data_merge_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/data_merge_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### The first arg ($1) is the prefix directory where the environment is saved 14 | ### The second arg ($2) is the directory to use when building the environment 15 | 16 | set -ex # allow for exiting based on non-0 codes 17 | 18 | # set workdir 19 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 20 | 21 | # create environment 22 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 23 | 24 | # trigger run 25 | cd $CHEMNLP_PATH 26 | python experiments/data/merge_epmc_to_jsonl.py 27 | -------------------------------------------------------------------------------- /experiments/scripts/env_creation_hf.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ### This script creates a conda environment for chemnlp 3 | ### The first arg ($1) is the prefix directory where the environment is saved 4 | ### The second arg ($2) is the directory to use when building the environment 5 | 6 | ## Must already have miniconda installed! 7 | export CONDA_ENV_PATH=/fsx/proj-chemnlp/$1/conda/env/chemnlp-hf 8 | export PYTHON_VER=3.8 9 | CUDA_VERSION=11.7 10 | CONDA_BASE=$(conda info --base) 11 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 12 | 13 | ## ensure we can use activate syntax in slurm scripts 14 | source $CONDA_BASE/etc/profile.d/conda.sh 15 | 16 | # Create Python environment through conda 17 | if [ -d "${CONDA_ENV_PATH}" ] 18 | then 19 | # if already exists activate 20 | echo "Found ${CONDA_ENV_PATH} in the directory, activating it!" 21 | conda activate ${CONDA_ENV_PATH} 22 | else 23 | # otherwise create new env (race conditions exist) 24 | echo "Creating ${CONDA_ENV_PATH} environment!" 25 | conda create --force --prefix ${CONDA_ENV_PATH} python=${PYTHON_VER} -y 26 | conda activate ${CONDA_ENV_PATH} 27 | 28 | ## clone + submodules (ok if exists) 29 | cd /fsx/proj-chemnlp/$2 30 | [ ! -d 'chemnlp' ] && git clone --recurse-submodules git@github.com:OpenBioML/chemnlp.git 31 | 32 | ## install core requirements 33 | conda install -y pytorch torchvision torchaudio pytorch-cuda=${CUDA_VERSION} -c pytorch -c nvidia --verbose 34 | cd $CHEMNLP_PATH 35 | pip install ".[training]" 36 | pip install lm-evaluation-harness/ 37 | fi 38 | -------------------------------------------------------------------------------- /experiments/scripts/env_creation_neox.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ### This script creates a conda environment for chemnlp 3 | ### The first arg ($1) is the prefix directory where the environment is saved 4 | ### The second arg ($2) is the directory to use when building the environment 5 | 6 | ## Must already have miniconda installed! 
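# (Illustrative usage note, not part of the original script.) Following the argument
# convention described above, and mirroring how env_creation_hf.sh is sourced from the
# sbatch scripts, a typical invocation would be:
#   source experiments/scripts/env_creation_neox.sh <env-prefix-dir> <build-dir>
# where <env-prefix-dir> and <build-dir> are placeholders for your own directories.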
7 | export CONDA_ENV_PATH=/fsx/proj-chemnlp/$1/conda/env/chemnlp-neox 8 | export PYTHON_VER=3.8 9 | 10 | ## ensure we can use activate syntax in slurm scripts 11 | CONDA_BASE=$(conda info --base) 12 | source $CONDA_BASE/etc/profile.d/conda.sh 13 | 14 | # Create Python environment through conda 15 | if [ -d "${CONDA_ENV_PATH}" ]; then rm -Rf ${CONDA_ENV_PATH}; fi 16 | conda create --force --prefix ${CONDA_ENV_PATH} python=${PYTHON_VER} -y 17 | conda activate ${CONDA_ENV_PATH} 18 | 19 | # Python requirements 20 | ## cd into your directory inside of proj-chemnlp 21 | cd /fsx/proj-chemnlp/$2 22 | 23 | ## clone + submodules (ok if exists) 24 | [ ! -d 'chemnlp' ] && git clone --recurse-submodules git@github.com:OpenBioML/chemnlp.git 25 | 26 | ## install 27 | cd chemnlp/gpt-neox 28 | pip install -r requirements/requirements.txt # base gpt-neox reqs 29 | pip install -r requirements/requirements-wandb.txt # add wand monitoring reqs 30 | 31 | ## downgrades / pins 32 | pip install protobuf=="3.20" 33 | pip install numpy=="1.23" 34 | -------------------------------------------------------------------------------- /experiments/scripts/eval_create_batch_configs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import yaml 5 | from lm_eval import config 6 | 7 | CHECKPOINT_DIR = "checkpoint-final" 8 | 9 | 10 | def run( 11 | config_path: str, 12 | root_models_path: str, 13 | ): 14 | raw_config = config.load_config(config_path) 15 | 16 | model_names = [ 17 | name 18 | for name in os.listdir(root_models_path) 19 | if os.path.isdir(os.path.join(root_models_path, name)) 20 | ] 21 | 22 | for model_name in model_names: 23 | raw_config["model_args"] = ( 24 | f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}" 25 | ) 26 | raw_config["wandb_run_name"] = model_name 27 | 28 | with open( 29 | f"{root_models_path}/{model_name}/eval_config.yml", "w" 30 | ) as new_config: 31 | yaml.dump(raw_config, new_config) 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument( 37 | "config_path", help="The full path to the example YAML config file." 38 | ) 39 | parser.add_argument( 40 | "root_models_path", 41 | help="The full path to the parent directory containing models.", 42 | ) 43 | args = parser.parse_args() 44 | run(args.config_path, args.root_models_path) 45 | -------------------------------------------------------------------------------- /experiments/scripts/miniconda_install.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cd ~ 4 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 5 | bash Miniconda3-latest-Linux-x86_64.sh # Follow instructions, accept all conditions blindly 6 | -------------------------------------------------------------------------------- /experiments/scripts/run_eval.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="chemtest" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/eval_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/eval_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs lm_eval2 experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the eval config.yaml file 17 | 18 | set -ex # allow for exiting based on non-0 codes 19 | overrides=${4:-'{}'} 20 | # set workdir 21 | CHEMNLP_PATH=/fsx/proj-chemnlp/$1/chemnlp 22 | 23 | # create environment 24 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 25 | 26 | export TOKENIZERS_PARALLELISM=false 27 | 28 | # trigger run 29 | cd $CHEMNLP_PATH/lm-evaluation-harness 30 | python main_eval.py $3 --config_overrides $overrides 31 | -------------------------------------------------------------------------------- /experiments/scripts/run_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="chemtest" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/batch_eval_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/batch_eval_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs lm_eval2 experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the default eval config.yaml file 17 | ### The fourth arg ($4) is the path to the parent file containing the models to evaluate 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | 21 | # set workdir 22 | CHEMNLP_PATH=/fsx/proj-chemnlp/$1/chemnlp 23 | 24 | # create environment 25 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 26 | 27 | # create experiment config for each model 28 | python $CHEMNLP_PATH/experiments/scripts/eval_create_batch_configs.py $3 $4 29 | 30 | # evaluate each model 31 | for entry in $4/*/ 32 | do 33 | sbatch $CHEMNLP_PATH/experiments/scripts/run_eval.sh $1 $2 "$entry"eval_config.yml 34 | sleep 1 35 | done 36 | -------------------------------------------------------------------------------- /experiments/scripts/run_grid_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from chemnlp.data_val.config import GridSearch 5 | from chemnlp.utils import _get_all_combinations 6 | 7 | # User-defined parameters 8 | MULTINODE_RUNS = False 9 | SBATCH_SCRIPT = ( 10 | "experiments/scripts/sbatch_train_hf_multinode.sh" 11 | if MULTINODE_RUNS 12 | else "experiments/scripts/sbatch_train_hf.sh" 13 | ) 14 | 15 | WANDB_GRID_GROUPNAME = "test-grid-search-singlenode" 16 | CONDA_ENV = "experiments/training-env" 17 | CHEMNLP_FOLDER = "jack" 18 | 19 | BASE_CONFIGS = ["1B_fine_tune.yml"] # , "3B_fine_tune.yml"] 20 | GRID_PARAMETERS = GridSearch( 21 | data={"path": ["/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv"]}, 22 | 
trainer={"learning_rate": [3e-4, 3e-3], "lr_scheduler_type": ["linear", "cosine"]}, 23 | ) 24 | 25 | if __name__ == "__main__": 26 | # Job submission loop 27 | for config_path in BASE_CONFIGS: 28 | # for each base configuration 29 | config_name = config_path.split(".")[0] 30 | all_possible_hyperparams = _get_all_combinations(GRID_PARAMETERS.dict()) 31 | 32 | for i, overriding_params in enumerate(all_possible_hyperparams): 33 | # set checkpoint dir & wandb run name 34 | run_name = f"{config_name}_{i}" 35 | overriding_params["wandb"]["name"] = run_name 36 | overriding_params["wandb"]["group"] = WANDB_GRID_GROUPNAME 37 | overriding_params["trainer"][ 38 | "output_dir" 39 | ] = f"/fsx/proj-chemnlp/experiments/checkpoints/finetuned/{WANDB_GRID_GROUPNAME}/{run_name}" 40 | # remove spaces for bash 41 | overriding_json = f"'{json.dumps(overriding_params)}'".replace(" ", "") 42 | 43 | # submit every combination of grid search parameters 44 | cmd = f"sbatch {SBATCH_SCRIPT} {CONDA_ENV} {CHEMNLP_FOLDER} {config_path} {overriding_json}" 45 | subprocess.run(cmd, shell=True) 46 | -------------------------------------------------------------------------------- /experiments/scripts/sbatch_train_hf.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llchem-singlenode" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/training_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/training_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs a GPT-NeoX experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the base training config 17 | ### The fourth arg ($4) is an optional json of any overriding configuration values 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | export TOKENIZERS_PARALLELISM=false 21 | export WANDB_BASE_URL="https://stability.wandb.io" 22 | overrides=${4:-'{}'} 23 | 24 | # set workdir 25 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 26 | 27 | # create environment 28 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 29 | 30 | # trigger run 31 | cd $CHEMNLP_PATH 32 | torchrun --standalone --nnodes 1 --nproc-per-node 8 \ 33 | experiments/scripts/run_tune.py experiments/configs/hugging-face/$3 --config_overrides $overrides 34 | -------------------------------------------------------------------------------- /experiments/scripts/sbatch_train_hf_multinode.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="llchem-multinode" 3 | #SBATCH --nodes=4 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/training_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/training_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs a GPT-NeoX experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the base training config 17 | ### The fourth arg ($4) is an optional json of any overriding configuration values 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | export TOKENIZERS_PARALLELISM=false 21 | export WANDB_BASE_URL="https://stability.wandb.io" 22 | export NCCL_DEBUG=INFO 23 | export NCCL_ASYNC_ERROR_HANDLING=1 24 | export LOGLEVEL=INFO 25 | overrides=${4:-'{}'} 26 | 27 | # set workdir 28 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 29 | 30 | # create environment 31 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 32 | 33 | # Get multinode information 34 | nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) 35 | nodes_array=($nodes) 36 | head_node=${nodes_array[0]} 37 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 38 | echo Node IP: $head_node_ip 39 | 40 | # Run script 41 | srun torchrun --nnodes $SLURM_NNODES --nproc_per_node 8 \ 42 | --rdzv_id $RANDOM \ 43 | --rdzv_backend c10d \ 44 | --rdzv_endpoint $head_node_ip:29500 \ 45 | experiments/scripts/run_tune.py experiments/configs/hugging-face/$3 --config_overrides $overrides 46 | -------------------------------------------------------------------------------- /experiments/scripts/transfer_all_checkpoint_to_s3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llchem-transfer-batch" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/transfer_batch_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/transfer_batch_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ## This script recursively copies a directory to S3 storage 14 | ### The first arg ($1) is the full path to a folder (i.e. <....>/1B_experiments/) 15 | ### The second arg ($2) is the S3 bucket to copy to (i.e. 
llchem-models) 16 | ### The third argument is the directory inside proj-chemnlp to find chemnlp 17 | 18 | EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` 19 | EC2_REGION="`echo \"$EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" 20 | CHEMNLP_PATH=/fsx/proj-chemnlp/$3/chemnlp 21 | CHECKPOINT_DIR=/fsx/proj-chemnlp/experiments/checkpoints 22 | 23 | echo "Finding checkpoints in $1" 24 | all_checkpoints=( $(find $1 -name "checkpoint-*" -type d) ) 25 | 26 | echo "Saving checkpoints to region: $EC2_REGION" 27 | for chkpt in ${all_checkpoints[@]} 28 | do 29 | sbatch $CHEMNLP_PATH/experiments/scripts/transfer_checkpoint_to_s3.sh $chkpt $2 30 | sleep 1 31 | done 32 | -------------------------------------------------------------------------------- /experiments/scripts/transfer_checkpoint_to_s3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llchem-transfer" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/transfer_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/transfer_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ## This script recursively copies a directory to S3 storage 14 | ### The first arg ($1) is a full path to a checkpoint folder (i.e. <....>/checkpoint-1000) 15 | ### The second arg ($2) is the S3 bucket to copy to (i.e. llchem-models) 16 | 17 | cutat=checkpoints/ 18 | TARGET_DIR=$(echo $1 | awk -F $cutat '{print $2}') # turns /a/b/checkpoints/c/d/ -> c/d/ 19 | PARENT_DIR="$(dirname "$1")" 20 | CHILD_FILE="$(basename "$1")" 21 | 22 | if [ ! -f "$1.tar" ]; then 23 | cd $PARENT_DIR && tar -cvf $CHILD_FILE.tar $CHILD_FILE 24 | fi 25 | 26 | aws s3 cp $1.tar s3://$2/$TARGET_DIR.tar 27 | -------------------------------------------------------------------------------- /experiments/scripts/transfer_hf_cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hf_subdirs=( accelerate datasets hub ) 3 | for sub_dir in "${hf_subdirs[@]}" 4 | do 5 | cp -R ~/.cache/huggingface/$sub_dir/* /fsx/proj-chemnlp/hf_cache/$sub_dir 6 | rm -rf ~/.cache/huggingface/$sub_dir 7 | done 8 | -------------------------------------------------------------------------------- /experiments/working/calculate_nll.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | 6 | # See: https://huggingface.co/docs/transformers/perplexity 7 | 8 | MAX_LENGTH = 2048 9 | STRIDE = 512 10 | BASE_STRING = "EleutherAI/pythia-" 11 | PYTHIA_MODELS = ["70m", "160m", "410m", "1b", "1.4b", "2.8b"] 12 | MODELS = [BASE_STRING + x for x in PYTHIA_MODELS] 13 | print(f"Running models: {MODELS}") 14 | 15 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | 17 | dataset = load_dataset("aeslc")["test"] 18 | print(f"Loaded dataset, size: {len(dataset)}") 19 | 20 | results = {k: None for k in MODELS} 21 | 22 | for model_name in MODELS: 23 | print(f"Starting model: {model_name}") 24 | model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE) 25 | tokenizer = AutoTokenizer.from_pretrained(model_name) 26 | 27 | # Join dataset into one long document: 28 | tokenized_dataset = tokenizer( 29 | 
"\n\n".join(dataset["email_body"]), return_tensors="pt" 30 | ) 31 | seq_len = tokenized_dataset.input_ids.size(1) 32 | prev_end_loc = 0 33 | nlls = [] 34 | 35 | for begin_loc in tqdm(range(0, seq_len, STRIDE)): 36 | end_loc = min(begin_loc + MAX_LENGTH, seq_len) 37 | trg_len = end_loc - prev_end_loc # may be different from stride on last loop 38 | 39 | # Get data: 40 | input_ids = tokenized_dataset.input_ids[:, begin_loc:end_loc].to(DEVICE) 41 | target_ids = input_ids.clone() 42 | target_ids[:, :-trg_len] = -100 # set all but last trg_len tokens to -100 43 | 44 | # Forward pass; 45 | with torch.no_grad(): 46 | outputs = model(input_ids, labels=target_ids) 47 | 48 | # loss is calculated using CrossEntropyLoss which averages over input tokens. 49 | # Multiply it with trg_len to get the summation instead of average. 50 | # We will take average over all the tokens to get the true average 51 | # in the last step of this example. 52 | neg_log_likelihood = outputs.loss * trg_len 53 | 54 | nlls.append(neg_log_likelihood) 55 | 56 | prev_end_loc = end_loc 57 | if end_loc >= seq_len: 58 | break 59 | 60 | results[model_name] = torch.stack(nlls).sum() / end_loc 61 | print(results) 62 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ChemNLP Documentation 2 | theme: 3 | name: material 4 | nav: 5 | - Home: index.md 6 | - User Guide: 7 | - Installation: user-guide/installation.md 8 | - Quick Start: user-guide/quickstart.md 9 | - API Reference: 10 | - Sampler Module: api/sampler.md 11 | - Sampler CLI: api/sampler_cli.md 12 | - Meta YAML Generator: api/meta_yaml_generator.md 13 | - Meta YAML Augmentor: api/meta_yaml_augmentor.md 14 | - Examples: 15 | - Basic Usage: examples/basic-usage.md 16 | - Advanced Techniques: examples/advanced-techniques.md 17 | - Contributing: CONTRIBUTING.md 18 | - Changelog: changelog.md 19 | markdown_extensions: 20 | - pymdownx.highlight 21 | - pymdownx.superfences 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "chemnlp" 7 | description = "Open source chemistry dataset & LLM" 8 | readme = "README.md" 9 | requires-python = "==3.9.*" 10 | dependencies = ["pandas", "pydantic", "pydantic_yaml<=0.11.2", "fire", "loguru"] 11 | dynamic = ["version"] 12 | 13 | [project.optional-dependencies] 14 | dev = ["pre-commit", "pytest"] 15 | dataset_creation = [ 16 | "PyTDC", 17 | "rdkit", 18 | "ruamel.yaml", 19 | "selfies", 20 | "deepsmiles", 21 | "pubchempy", 22 | "bioc", 23 | "pylatexenc", 24 | "canonicalize_psmiles@git+https://github.com/Ramprasad-Group/canonicalize_psmiles.git", 25 | "rxn-chem-utils", 26 | "backoff", 27 | "givemeconformer", 28 | "chembl_webresource_client", 29 | "dask", 30 | "pandarallel" 31 | ] 32 | 33 | [project.scripts] 34 | chemnlp-generate-meta = "chemnlp.data.meta_yaml_generator:cli" 35 | chemnlp-augment-meta = "chemnlp.data.meta_yaml_augmenter:cli" 36 | chemnlp-sample = "chemnlp.data.sampler_cli:cli" 37 | chemnlp-add-random-split-column = "chemnlp.data.utils:add_random_split_column_cli" 38 | chemnlp-concatenate-jsonl = "chemnlp.data.utils:concatenate_jsonl_files_cli" 39 | 40 | [tool.setuptools_scm] 41 | version_scheme = "post-release" 42 | 
--------------------------------------------------------------------------------
/src/chemnlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/src/chemnlp/__init__.py
--------------------------------------------------------------------------------
/src/chemnlp/data/hf_datasets.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 |
3 |
4 | def boolq(tokenizer):
5 |     dataset = load_dataset("boolq")
6 |
7 |     def _tokenize_function(example, tokenizer):
8 |         all_text = f"Passage:\n{example['passage']} \nQuestion:\n{example['question']}\nAnswer:\n{example['answer']}"
9 |         return tokenizer(all_text)
10 |
11 |     tokenized = dataset.map(
12 |         _tokenize_function,
13 |         fn_kwargs={"tokenizer": tokenizer},
14 |         remove_columns=["question", "answer", "passage"],
15 |     )
16 |
17 |     return tokenized["train"], tokenized["validation"]
18 |
19 |
20 | def rotten_tomatoes(tokenizer):
21 |     dataset = load_dataset("rotten_tomatoes")
22 |
23 |     def _tokenize_function(example, tokenizer):
24 |         return tokenizer(example["text"])
25 |
26 |     tokenized = dataset.map(
27 |         _tokenize_function,
28 |         fn_kwargs={"tokenizer": tokenizer},
29 |         remove_columns=["text", "label"],
30 |     )
31 |
32 |     return tokenized["train"], tokenized["validation"]
--------------------------------------------------------------------------------
/src/chemnlp/data/meta.yaml:
--------------------------------------------------------------------------------
1 | bibtex:
2 |   - "@article{martins2023,\nauthor = {Martins, John and Doe, Jane and Smith, Alice},\ntitle = {Study on Blood-Brain Barrier Penetration of Various Drugs},\njournal = {Journal of Pharmacology},\nvolume = {12},\nnumber = {3},\npages = {123-134},\nyear = {2023},\ndoi = {10.1234/jpharm.2023.56789}}"
3 | description: Describing the ability of different drugs to penetrate the blood-brain barrier.
4 | identifiers:
5 |   - description: Simplified Molecular Input Line Entry System
6 |     id: SMILES
7 |     type: SMILES
8 |   - description: Name of the compound
9 |     id: compound_name
10 |     names:
11 |       - noun: compound name
12 |     type: Other
13 | license: CC BY 4.0
14 | links:
15 |   - description: corresponding publication
16 |     url: https://example.com/publication
17 |   - description: data source
18 |     url: https://example.com/data_source
19 | name: blood_brain_barrier_martins_et_al
20 | num_points: 2030
21 | targets:
22 |   - description: Indicates whether the compound can penetrate the blood-brain barrier (1 for yes, 0 for no)
23 |     id: penetrate_BBB
24 |     names:
25 |       - noun: blood-brain barrier penetration
26 |     type: integer
27 | templates:
28 |   - The compound {compound_name__names__noun} with SMILES {SMILES#} can {#penetrate|not penetrate!} the blood-brain barrier.
29 |   - The compound {compound_name__names__noun} with SMILES {SMILES#} is in the {split#} set.
30 |   - "Question: Which of the following compounds can penetrate the blood-brain barrier?\nOptions: {%multiple_choice_enum%4%aA1}\n{compound_name%}\nAnswer: {%multiple_choice_result}"
31 |   - The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#}
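The templates above mix plain column placeholders such as {SMILES#} and {penetrate_BBB#} with sampler-specific syntax ({#penetrate|not penetrate!} choices, {%multiple_choice_enum%4%aA1} blocks) that is expanded by chemnlp's sampler module. Purely as a hedged illustration of what a filled-in prompt can look like, the sketch below substitutes only the simple {column#} fields for one made-up data row; it does not reproduce the real sampler logic.

# Made-up example row; real rows come from the tabular dataset this meta.yaml describes.
row = {"SMILES": "CC(Cc1ccc(cc1)C(C(=O)O)C)C", "penetrate_BBB": 1}

# Naive fill of the last template: replace each {column#} placeholder with the row value.
template = "The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#}"
filled = template.replace("{SMILES#}", row["SMILES"]).replace("{penetrate_BBB#}", str(row["penetrate_BBB"]))
print(filled)
# The compound with SMILES CC(Cc1ccc(cc1)C(C(=O)O)C)C can penetrate the blood-brain barrier:1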
--------------------------------------------------------------------------------
/src/chemnlp/data/random_variable.py:
--------------------------------------------------------------------------------
1 | import random
2 | from functools import partial
3 | from typing import Callable, Optional
4 |
5 |
6 | def unwrap_list_length_1(list_input: list):
7 |     """Unwraps a list of length 1 and returns its single element."""
8 |     if isinstance(list_input, list):
9 |         assert len(list_input) == 1
10 |         return list_input[0]
11 |     else:
12 |         raise NotImplementedError()
13 |
14 |
15 | class RandomVariable:
16 |     """Simple random variable class that takes in a name, data, and a sampler.
17 |     The sampler needs to return a list containing a single element."""
18 |
19 |     def __init__(self, name: str, data: list, sampler: Optional[Callable] = None):
20 |         self.name = name
21 |         self.data = data
22 |         self.sampler = partial(random.sample, k=1) if sampler is None else sampler
23 |
24 |     def __repr__(self):
25 |         return f"RandomVariable: {self.name}, {self.data}, {self.sampler}"
26 |
27 |     def __call__(self) -> str:
28 |         """Carries out sampling and returns a single element."""
29 |         return unwrap_list_length_1(self.sampler(self.data))
--------------------------------------------------------------------------------
/src/chemnlp/data/reprs.py:
--------------------------------------------------------------------------------
1 | import backoff
2 | import deepsmiles
3 | import pubchempy as pcp
4 | import requests
5 | import selfies
6 | from rdkit import Chem
7 |
8 |
9 | def smiles_to_selfies(smiles: str) -> str:
10 |     """
11 |     Takes a SMILES and returns the SELFIES encoding.
12 |     """
13 |
14 |     return selfies.encoder(smiles)
15 |
16 |
17 | def smiles_to_deepsmiles(smiles: str) -> str:
18 |     """
19 |     Takes a SMILES and returns the DeepSMILES encoding.
20 |     """
21 |     converter = deepsmiles.Converter(rings=True, branches=True)
22 |     return converter.encode(smiles)
23 |
24 |
25 | def smiles_to_canoncial(smiles: str) -> str:
26 |     """
27 |     Takes a SMILES and returns the canonical SMILES.
28 |     """
29 |     mol = Chem.MolFromSmiles(smiles)
30 |     return Chem.MolToSmiles(mol)
31 |
32 |
33 | def smiles_to_inchi(smiles: str) -> str:
34 |     """
35 |     Takes a SMILES and returns the InChI.
36 |     """
37 |     mol = Chem.MolFromSmiles(smiles)
38 |     return Chem.MolToInchi(mol)
39 |
40 |
41 | def smiles_to_safe(smiles: str) -> str:
42 |     """
43 |     Takes a SMILES and returns the SAFE encoding.
44 |     """
45 |     import safe
46 |
47 |     return safe.encode(smiles, seed=42, canonical=True, randomize=False)
48 |
49 |
50 | CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"
51 |
52 |
53 | @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
54 | def cactus_request_w_backoff(smiles, rep="iupac_name"):
55 |     url = CACTUS.format(smiles, rep)
56 |     response = requests.get(url, allow_redirects=True, timeout=10)
57 |     response.raise_for_status()
58 |     name = response.text
59 |     if "html" in name:
60 |         return None
61 |     return name
62 |
63 |
64 | def smiles_to_iupac_name(smiles: str) -> str:
65 |     """Use the chemical name resolver https://cactus.nci.nih.gov/chemical/structure.
66 |     If this does not work, use PubChem.
67 |     """
68 |     try:
69 |         name = cactus_request_w_backoff(smiles, rep="iupac_name")
70 |         if name is None:
71 |             raise Exception
72 |         return name
73 |     except Exception:
74 |         try:
75 |             compound = pcp.get_compounds(smiles, "smiles")
76 |             return compound[0].iupac_name
77 |         except Exception:
78 |             return None
79 |
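A small usage sketch for the converters above; this is illustrative only and assumes chemnlp is installed together with the dataset_creation extras so that rdkit, selfies, and deepsmiles are importable. The exact output strings depend on those libraries, so they are printed rather than asserted; note that smiles_to_canoncial is spelled here exactly as in the module.

from chemnlp.data.reprs import (
    smiles_to_canoncial,
    smiles_to_deepsmiles,
    smiles_to_inchi,
    smiles_to_selfies,
)

# Ibuprofen, the same molecule used in tests/test_reprs.py below.
smiles = "CC(Cc1ccc(cc1)C(C(=O)O)C)C"

# Each helper takes a SMILES string and returns an alternative text representation.
print(smiles_to_canoncial(smiles))
print(smiles_to_selfies(smiles))
print(smiles_to_deepsmiles(smiles))
print(smiles_to_inchi(smiles))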
--------------------------------------------------------------------------------
/src/chemnlp/data_val/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/src/chemnlp/data_val/__init__.py
--------------------------------------------------------------------------------
/src/chemnlp/data_val/validate.py:
--------------------------------------------------------------------------------
1 | import os
2 | from glob import glob
3 |
4 | import fire
5 |
6 | from .model import Dataset
7 |
8 |
9 | def validate_meta(file):
10 |     """Validate a metadata file."""
11 |
12 |     try:
13 |         with open(file, "r") as f:
14 |             _model = Dataset.parse_raw(f.read())  # noqa: F841
15 |     except Exception as e:
16 |         raise ValueError(f"Error parsing {file}: {e}")
17 |
18 |
19 | def validate_folder(folder):
20 |     """Validate all metadata files in a folder."""
21 |
22 |     files = glob(os.path.join(folder, "**", "meta.yaml"))
23 |     for file in files:
24 |         validate_meta(file)
25 |     return True
26 |
27 |
28 | if __name__ == "__main__":
29 |     fire.Fire(validate_folder)
30 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/tests/data/__init__.py
--------------------------------------------------------------------------------
/tests/test_ner.py:
--------------------------------------------------------------------------------
1 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner
2 |
3 |
4 | def test_tokens_by_label():
5 |     tokens = ["a", "b", "c", "d", "e", "f"]
6 |
7 |     labels = [0, 1, 1, 0, 1, 0]
8 |     grouped_tokens = group_tokens_by_labels(tokens, labels)
9 |     assert set(grouped_tokens) == set(["b", "c", "e"])
10 |
11 |     labels = [0, 1, 2, 0, 1, 0]
12 |     grouped_tokens = group_tokens_by_labels(tokens, labels)
13 |     assert set(grouped_tokens) == set(["b c", "e"])
14 |
15 |
16 | def test_join_punctuation():
17 |     token_list = [
18 |         "This",
19 |         "is",
20 |         "a",
21 |         "list",
22 |         "of",
23 |         "tokens",
24 |         "with",
25 |         "2",
26 |         ".",
27 |         "5",
28 |         ",",
29 |         "and",
30 |         "3",
31 |         "numbers",
32 |         "intact",
33 |         "semi",
34 |         "-",
35 |         "colon",
36 |         "separated",
37 |         "words",
38 |         "with",
39 |         "decimal",
40 |         "numbers",
41 |         "split",
42 |         "at",
43 |         "dots",
44 |         ".",
45 |         "This",
46 |         "is",
47 |         "a",
48 |         "comma",
49 |         ",",
50 |         "and",
51 |         "a",
52 |         "dot",
53 |         "(",
54 |         "test",
55 |         ")",
56 |         ".",
57 |     ]
58 |     sentence = punctuation_joiner(token_list)
59 |     print(sentence)
60 |     assert (
61 |         sentence
62 |         == "This is a list of tokens with 2.5, and 3 numbers intact semi-colon separated words with decimal numbers split at dots. This is a comma, and a dot (test)."  # noqa
63 |     )
64 |
--------------------------------------------------------------------------------
/tests/test_reprs.py:
--------------------------------------------------------------------------------
1 | from chemnlp.data.reprs import smiles_to_iupac_name
2 |
3 | # not used at the moment
4 | # def test_smiles_to_safe():
5 | #     safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
6 | #     # equivalent, only rotations, it is not completely deterministic
7 | #     assert (
8 | #         safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
9 | #         or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
10 | #     )
11 |
12 |
13 | def test_smiles_to_iupac_name():
14 |     iupac_name = smiles_to_iupac_name("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
15 |     assert iupac_name == "2-[4-(2-methylpropyl)phenyl]propanoic acid"
16 |
--------------------------------------------------------------------------------
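As a closing usage note (not part of any repository file): the validate_folder helper from src/chemnlp/data_val/validate.py shown above can be pointed at a data directory to check every dataset's meta.yaml in one pass. A minimal sketch, assuming the package is installed and the script is run from the repository root so that the data/tabular layout from the tree at the top of this dump is in place:

from chemnlp.data_val.validate import validate_folder

# Globs <folder>/**/meta.yaml and parses each match into the pydantic Dataset model;
# a ValueError is raised for the first file that fails validation.
validate_folder("data/tabular")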