├── .github └── workflows │ └── install.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets └── logo │ ├── LLcheD.png │ ├── LLcheD.svg │ ├── LLcheM.png │ ├── LLcheM.svg │ ├── chemnlp.png │ └── chemnlp.svg ├── code_of_conduct.md ├── conda.yaml ├── data ├── check_pandas.py ├── check_smiles_split.py ├── kg │ ├── chebi_chebi │ │ └── meta.yaml │ ├── chembl33_preprocessed_filtered_bioactivity_dataset_w_fullprotnames_smiles │ │ └── meta.yaml │ ├── compound_chebi │ │ └── meta.yaml │ ├── compound_chebi_chebi │ │ └── meta.yaml │ ├── compound_chebi_chebi_chebi_1 │ │ └── meta.yaml │ ├── compound_chebi_chebi_chebi_2 │ │ └── meta.yaml │ ├── compound_protein │ │ └── meta.yaml │ ├── compound_protein_compound_1 │ │ └── meta.yaml │ ├── compound_protein_compound_2 │ │ └── meta.yaml │ ├── compound_protein_compound_3 │ │ └── meta.yaml │ ├── compound_protein_disease │ │ └── meta.yaml │ ├── compound_protein_domain │ │ └── meta.yaml │ ├── compound_protein_ec_number │ │ └── meta.yaml │ ├── compound_protein_go_term_1 │ │ └── meta.yaml │ ├── compound_protein_go_term_2 │ │ └── meta.yaml │ ├── compound_protein_go_term_3 │ │ └── meta.yaml │ ├── compound_protein_go_term_4 │ │ └── meta.yaml │ ├── compound_protein_hpo │ │ └── meta.yaml │ ├── compound_protein_hpo_disease_1 │ │ └── meta.yaml │ ├── compound_protein_hpo_disease_2 │ │ └── meta.yaml │ ├── compound_protein_pathway │ │ └── meta.yaml │ ├── compound_protein_pathway_disease_1 │ │ └── meta.yaml │ ├── compound_protein_pathway_disease_2 │ │ └── meta.yaml │ ├── compound_protein_pathway_disease_3 │ │ └── meta.yaml │ ├── compound_protein_protein │ │ └── meta.yaml │ ├── drug_chebi │ │ └── meta.yaml │ ├── drug_chebi_chebi │ │ └── meta.yaml │ ├── drug_chebi_chebi_chebi │ │ └── meta.yaml │ ├── drug_disease_pathway │ │ └── meta.yaml │ ├── drug_disease_pathway_protein │ │ └── meta.yaml │ ├── drug_protein │ │ └── meta.yaml │ ├── drug_protein_disease │ │ └── meta.yaml │ ├── drug_protein_domain │ │ └── meta.yaml │ ├── drug_protein_drug │ │ └── meta.yaml │ ├── drug_protein_ec_number │ │ └── meta.yaml │ ├── drug_protein_go_term │ │ └── meta.yaml │ ├── drug_protein_hpo │ │ └── meta.yaml │ ├── drug_protein_hpo_disease │ │ └── meta.yaml │ ├── drug_protein_pathway │ │ └── meta.yaml │ ├── drug_protein_pathway_disease │ │ └── meta.yaml │ └── drug_protein_protein │ │ └── meta.yaml ├── natural │ ├── preprocess_europepmc.py │ ├── preprocess_msds.py │ ├── preprocess_nougat.py │ └── preprocess_nougat.sh ├── postprocess_split.py ├── tabular │ ├── BACE │ │ ├── meta.yaml │ │ └── transform.py │ ├── BBBP │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_466 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_548 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_600 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_644 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_652 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_689 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_692 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_712 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_713 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_733 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_737 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_810 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_832 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_846 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_852 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_858 │ │ ├── meta.yaml │ │ └── transform.py │ ├── MUV_859 │ │ ├── meta.yaml │ │ └── transform.py │ ├── RedDB │ │ ├── meta.yaml │ │ └── 
transform.py │ ├── SIDER │ │ ├── meta.yaml │ │ └── transform.py │ ├── ames_mutagenicity │ │ ├── meta.yaml │ │ └── transform.py │ ├── aminoacids │ │ ├── meta.yaml │ │ └── transform.py │ ├── bc5chem │ │ ├── meta.yaml │ │ └── transform.py │ ├── bc5disease │ │ ├── meta.yaml │ │ └── transform.py │ ├── bicerano_dataset │ │ ├── meta.yaml │ │ └── transform.py │ ├── bio_ner │ │ ├── meta.yaml │ │ └── transform.py │ ├── bioavailability_ma_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── block_polymers_morphology │ │ ├── meta.yaml │ │ └── transform.py │ ├── blood_brain_barrier_martins_et_al │ │ └── transform.py │ ├── buchwald_hartwig │ │ ├── meta.yaml │ │ └── transform.py │ ├── caco2_wang │ │ ├── meta.yaml │ │ └── transform.py │ ├── carcinogens │ │ ├── meta.yaml │ │ └── transform.py │ ├── cav3_t-type_calcium_channels_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── chebi_20 │ │ ├── meta.yaml │ │ └── transform.py │ ├── check_pandas.py │ ├── check_smiles_split.py │ ├── chem_caption_smarts │ │ ├── meta.yaml │ │ ├── preprocess.py │ │ └── transform.py │ ├── chembl_v29 │ │ ├── meta.yaml │ │ └── transform.py │ ├── chemcaption_fragments │ │ ├── meta.yaml │ │ └── transform.py │ ├── chemcaption_rdkit │ │ ├── meta.yaml │ │ ├── preprocess.py │ │ └── transform.py │ ├── chemdner │ │ ├── meta.yaml │ │ └── transform.py │ ├── chemistry_stackexchange │ │ ├── meta.yaml │ │ └── transform.py │ ├── choline_transporter_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── clearance_astrazeneca │ │ ├── meta.yaml │ │ └── transform.py │ ├── clintox │ │ ├── meta.yaml │ │ └── transform.py │ ├── core_mof_no_topo │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp2c9_substrate_carbonmangels │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp2d6_substrate_carbonmangels │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp3a4_substrate_carbonmangels │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_1a2_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_2c19_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_2c9_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_2d6_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── cyp_p450_3a4_inhibition_veith_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── drug_induced_liver_injury │ │ ├── meta.yaml │ │ └── transform.py │ ├── drugchat_liang_zhang_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── fda_adverse_reactions │ │ ├── meta.yaml │ │ └── transform.py │ ├── flashpoint │ │ ├── meta.yaml │ │ └── transform.py │ ├── formation_energies │ │ ├── meta.yaml │ │ └── transform.py │ ├── freesolv │ │ ├── meta.yaml │ │ └── transform.py │ ├── h2_storage_materials │ │ ├── LICENSE │ │ ├── meta.yaml │ │ ├── processing.ipynb │ │ └── transform.py │ ├── half_life_obach │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_blockers │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_central_at_10uM │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_central_at_1uM │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_central_inhib │ │ ├── meta.yaml │ │ └── transform.py │ ├── herg_karim_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── hiv │ │ ├── meta.yaml │ │ └── transform.py │ ├── human_intestinal_absorption │ │ ├── meta.yaml │ │ └── transform.py │ ├── inverse_1 │ │ ├── meta.yaml │ │ └── transform.py │ ├── inverse_2 │ │ ├── meta.yaml │ │ └── transform.py │ ├── inverse_3 │ │ ├── meta.yaml │ │ └── transform.py │ ├── iupac_goldbook │ │ ├── meta.yaml │ │ └── transform.py │ ├── iupac_smiles │ │ ├── meta.yaml │ │ └── 
transform.py │ ├── kcnq2_potassium_channel_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── ld50_catmos │ │ ├── meta.yaml │ │ └── transform.py │ ├── ld50_zhu │ │ ├── example_processing_and_templates.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── lipophilicity │ │ ├── data_original.txt │ │ ├── meta.yaml │ │ └── transform.py │ ├── m1_muscarinic_receptor_agonists_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── m1_muscarinic_receptor_antagonists_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── mattermodeling_stackexchange │ │ ├── meta.yaml │ │ └── transform.py │ ├── melting_points │ │ ├── meta.yaml │ │ └── transform.py │ ├── merge.py │ ├── mofdscribe │ │ ├── meta.yaml │ │ └── transform.py │ ├── mol2svg │ │ ├── meta.yaml │ │ └── transform.py │ ├── mol_repr_transl │ │ └── transform.py │ ├── mona │ │ ├── example_processing_and_templates.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── moses │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_anisotropy │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_bulk_modulus │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_descriptions │ │ ├── meta.yaml │ │ └── transform.py │ ├── mp_self_supervised │ │ ├── meta.yaml │ │ ├── prepare_data.py │ │ └── transform.py │ ├── mp_shear_modulus │ │ ├── meta.yaml │ │ └── transform.py │ ├── ncbi_disease │ │ ├── meta.yaml │ │ └── transform.py │ ├── nlmchem │ │ ├── meta.yaml │ │ └── transform.py │ ├── nomad_structure │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ahr_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ar_lbd_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ar_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_aromatase_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_er_lbd_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_er_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── nr_ppar_gamma_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── ocp │ │ ├── meta.yaml │ │ └── transform.py │ ├── odd_one_out │ │ ├── meta.yaml │ │ └── transform.py │ ├── opv │ │ ├── meta.yaml │ │ └── transform.py │ ├── oqmd │ │ ├── meta.yaml │ │ └── transform.py │ ├── orbnet_denali │ │ ├── develop_transform.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_masked │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_predictions │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_procedure_steps │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_rxn_smiles_procedure │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_rxn_smiles_yield_pred │ │ ├── meta.yaml │ │ └── transform.py │ ├── ord_steps_yield │ │ ├── meta.yaml │ │ └── transform.py │ ├── orexin1_receptor_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── p_glycoprotein_inhibition_broccatelli_et_al │ │ ├── meta.yaml │ │ └── transform.py │ ├── pampa_ncats │ │ ├── example_processing_and_templates.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── peptides_hemolytic │ │ ├── meta.yaml │ │ └── transform.py │ ├── peptides_nonfouling │ │ ├── meta.yaml │ │ └── transform.py │ ├── peptides_soluble │ │ ├── meta.yaml │ │ └── transform.py │ ├── perovskite_db │ │ ├── meta.yaml │ │ └── transform.py │ ├── physics_stackexchange │ │ ├── explore.ipynb │ │ ├── meta.yaml │ │ └── transform.py │ ├── potassium_ion_channel_kir2_1_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── qm8 │ │ ├── meta.yaml │ │ └── transform.py │ ├── qm9 │ │ ├── meta.yaml │ │ ├── prep_csv.py │ │ └── transform.py │ ├── qmof_gcmc │ │ ├── meta.yaml │ │ └── transform.py │ ├── qmof_quantum │ │ ├── meta.yaml │ │ └── transform.py │ ├── rdkit_features │ │ ├── meta.yaml │ │ └── transform.py │ 
├── rhea_db_masked │ │ ├── meta.yaml │ │ └── transform.py │ ├── rhea_db_predictions │ │ ├── meta.yaml │ │ └── transform.py │ ├── run_all_transform.sh │ ├── sarscov2_3clpro_diamond │ │ ├── meta.yaml │ │ └── transform.py │ ├── sarscov2_vitro_touret │ │ ├── meta.yaml │ │ └── transform.py │ ├── serine_threonine_kinase_33_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── sigma_aldrich_safety_data │ │ ├── meta.yaml │ │ └── transform.py │ ├── skin_reaction │ │ ├── meta.yaml │ │ └── transform.py │ ├── smiles_to_3d │ │ ├── meta.yaml │ │ └── transform.py │ ├── solubility_aqsoldb │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_are_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_atad5_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_hse_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_mmp_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── sr_p53_tox21 │ │ ├── meta.yaml │ │ └── transform.py │ ├── suzuki_miyaura_sach │ │ ├── meta.yaml │ │ └── transform.py │ ├── thermosol │ │ ├── meta.yaml │ │ └── transform.py │ ├── train_test_split.py │ ├── tyrosyl-dna_phosphodiesterase_butkiewicz │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_binding_single │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_binding_sites_multiple │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_organisms │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_reactions │ │ ├── meta.yaml │ │ └── transform.py │ ├── uniprot_sentences │ │ ├── meta.yaml │ │ └── transform.py │ ├── uspto │ │ ├── meta.yaml │ │ └── transform.py │ ├── uspto_yield │ │ ├── meta.yaml │ │ └── transform.py │ ├── volume_of_distribution_at_steady_state_lombardo_et_al │ │ ├── meta.yaml │ │ └── transform.py │ └── zinc │ │ ├── meta.yaml │ │ └── transform.py ├── text_sampling │ ├── extend_tabular.py │ ├── extend_tabular_processed.py │ ├── get_dataset_overlap.py │ ├── preprocess_kg.py │ ├── text_sampling.py │ └── utils.py └── train_test_split.py ├── docs ├── CONTRIBUTING.md ├── api │ ├── meta_yaml_augmentor.md │ ├── meta_yaml_generator.md │ ├── sampler.md │ └── sampler_cli.md └── index.md ├── experiments ├── README.md ├── ablations │ ├── 20240814_sample_data.bash │ └── continued_pretrain.py ├── configs │ ├── data_configs │ │ ├── data_mixing.yml │ │ ├── hf_data.yml │ │ ├── hf_data_wiki.yml │ │ ├── prep_lm_eval_data.yml │ │ └── prep_smiles_data.yml │ ├── deepspeed │ │ ├── deepspeed_S1.json │ │ ├── deepspeed_S2.json │ │ ├── deepspeed_offload_S2.json │ │ └── deepspeed_offload_S3.json │ ├── eval_configs │ │ ├── default_eval_config.yaml │ │ ├── nlp_eval_config.yaml │ │ ├── safety_eval_config.yaml │ │ └── stem_eval_config.yaml │ ├── gpt-neox │ │ ├── 160M.yml │ │ ├── cluster_setup.yml │ │ └── soft_prompt.yml │ └── hugging-face │ │ ├── 160M_full.yml │ │ ├── 160M_ptune.yml │ │ ├── 1B_fine_tune.yml │ │ ├── 3B_fine_tune.yml │ │ ├── 410M_fine_tune.yml │ │ └── 7B_fine_tune.yml ├── data │ ├── merge_epmc_to_jsonl.py │ ├── prepare_gptneox_chemrxiv.py │ ├── prepare_hf_dataset.py │ ├── prepare_lm_eval_dataset.py │ ├── prepare_mixed_data.py │ ├── prepare_smiles_dataset.py │ ├── prepare_xyz_denali_data.py │ ├── sbatch_hf_dataset.sh │ ├── sbatch_hf_split.sh │ ├── sbatch_merge_epmc_jsonl.sh │ └── split_data.py ├── scripts │ ├── env_creation_hf.sh │ ├── env_creation_neox.sh │ ├── eval_create_batch_configs.py │ ├── miniconda_install.sh │ ├── run_eval.sh │ ├── run_eval_batch.sh │ ├── run_grid_search.py │ ├── run_n_shot_benchmarks_eval.py │ ├── run_tune.py │ ├── sbatch_train_hf.sh │ ├── sbatch_train_hf_multinode.sh │ ├── sbatch_train_neox.sh │ ├── 
transfer_all_checkpoint_to_s3.sh │ ├── transfer_checkpoint_to_s3.sh │ └── transfer_hf_cache.sh └── working │ └── calculate_nll.py ├── mkdocs.yml ├── pyproject.toml ├── src └── chemnlp │ ├── __init__.py │ ├── data │ ├── constants.py │ ├── convert.py │ ├── hf_datasets.py │ ├── meta.yaml │ ├── meta_yaml_augmentor.py │ ├── meta_yaml_generator.py │ ├── ner.py │ ├── random_variable.py │ ├── reprs.py │ ├── sampler.py │ ├── sampler_cli.py │ ├── split.py │ └── utils.py │ ├── data_val │ ├── __init__.py │ ├── config.py │ ├── model.py │ └── validate.py │ └── utils.py └── tests ├── __init__.py ├── data ├── __init__.py ├── test_sampler.py └── test_sampler_cli.py ├── test_ner.py └── test_reprs.py /.github/workflows/install.yaml: -------------------------------------------------------------------------------- 1 | # GitHub action that attempts to install the conda env 2 | # from conda.yaml 3 | # then run black, isort, flake8 4 | name: Install 5 | on: [push, pull_request] 6 | jobs: 7 | install: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v2 11 | - uses: conda-incubator/setup-miniconda@v2 12 | with: 13 | environment-file: conda.yaml 14 | activate-environment: chemnlp 15 | python-version: 3.9 16 | auto-update-conda: true 17 | auto-activate-base: false 18 | - name: Validate yaml 19 | shell: bash -l {0} 20 | run: | 21 | conda activate chemnlp 22 | python -m src.chemnlp.data_val.validate data 23 | - name: Tests 24 | shell: bash -l {0} 25 | run: | 26 | pip install pytest 27 | pytest tests 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 OpenBioML 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 |
--------------------------------------------------------------------------------
/assets/logo/LLcheD.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/assets/logo/LLcheD.png
--------------------------------------------------------------------------------
/assets/logo/LLcheM.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/assets/logo/LLcheM.png
--------------------------------------------------------------------------------
/assets/logo/chemnlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/assets/logo/chemnlp.png
--------------------------------------------------------------------------------
/conda.yaml:
--------------------------------------------------------------------------------
1 | name: dummy
2 | dependencies:
3 |   - python==3.9.*
4 |   - pip
5 |   - pip:
6 |       - .
7 |       - .[dev]
8 |       - .[dataset_creation]
--------------------------------------------------------------------------------
/data/kg/chebi_chebi/meta.yaml:
--------------------------------------------------------------------------------
1 | name: chebi_chebi
2 | description: Knowledge graph data samples.
3 | targets:
4 |   - id: node2_type
5 |     description: node2_type
6 |     type: Other
7 |     units: node2_type
8 |     names:
9 |       - noun: node2_type
10 |   - id: node2_name
11 |     description: node2_name
12 |     type: Other
13 |     units: node2_name
14 |     names:
15 |       - noun: node2_name
16 |   - id: node2_id
17 |     description: node2_id
18 |     type: Other
19 |     units: node2_id
20 |     names:
21 |       - noun: node2_id
22 | identifiers:
23 |   - id: node1_type
24 |     description: node1_type
25 |     type: Other
26 |   - id: node1_name
27 |     description: node1_name
28 |     type: Other
29 |   - id: node1_id
30 |     description: node1_id
31 |     type: Other
32 |   - id: rel1_type
33 |     description: rel1_type
34 |     type: Other
35 | license: CC BY 4.0
36 | links:
37 |   - url: https://crossbar.kansil.org
38 |     description: original knowledge graph web GUI link
39 | num_points: 638182
40 | bibtex:
41 |   - "@article{10.1093/nar/gkab543,\nauthor = {Doğan, Tunca and Atas, Heval and Joshi, Vishal and Atakan, Ahmet and Rifaioglu, Ahmet Sureyya and Nalbat, Esra and Nightingale, Andrew and Saidi, Rabie and Volynkin, Vladimir and Zellner, Hermann and Cetin-Atalay, Rengul and Martin, Maria and Atalay, Volkan},\ntitle = \"{CROssBAR: comprehensive resource of biomedical relations with knowledge graph representations}\",\njournal = {Nucleic Acids Research},\nvolume = {49},\nnumber = {16},\npages = {e96-e96},\nyear = {2021},\nmonth = {06},\nissn = {0305-1048},\ndoi = {10.1093/nar/gkab543},\nurl = {https://doi.org/10.1093/nar/gkab543},\n}"
42 | templates:
43 |   - The {node1_name#} {rel1_type#} {node2_name#}.
--------------------------------------------------------------------------------
/data/kg/compound_chebi/meta.yaml:
--------------------------------------------------------------------------------
1 | name: compound_chebi
2 | description: Knowledge graph data samples.
3 | targets:
4 |   - id: node1_type
5 |     description: node1_type
6 |     type: Other
7 |     units: node1_type
8 |     names:
9 |       - noun: node1_type
10 |   - id: node1_name
11 |     description: node1_name
12 |     type: Other
13 |     units: node1_name
14 |     names:
15 |       - noun: node1_name
16 |   - id: node1_id
17 |     description: node1_id
18 |     type: Other
19 |     units: node1_id
20 |     names:
21 |       - noun: node1_id
22 |   - id: rel1_type
23 |     description: rel1_type
24 |     type: Other
25 |     units: rel1_type
26 |     names:
27 |       - noun: rel1_type
28 |   - id: node2_type
29 |     description: node2_type
30 |     type: Other
31 |     units: node2_type
32 |     names:
33 |       - noun: node2_type
34 |   - id: node2_name
35 |     description: node2_name
36 |     type: Other
37 |     units: node2_name
38 |     names:
39 |       - noun: node2_name
40 |   - id: node2_id
41 |     description: node2_id
42 |     type: Other
43 |     units: node2_id
44 |     names:
45 |       - noun: node2_id
46 | identifiers:
47 |   - id: SMILES
48 |     description: SMILES
49 |     type: SMILES
50 | license: CC BY 4.0
51 | links:
52 |   - url: https://crossbar.kansil.org
53 |     description: original knowledge graph web GUI link
54 | num_points: 6754
55 | bibtex:
56 |   - "@article{10.1093/nar/gkab543,\nauthor = {Doğan, Tunca and Atas, Heval and Joshi, Vishal and Atakan, Ahmet and Rifaioglu, Ahmet Sureyya and Nalbat, Esra and Nightingale, Andrew and Saidi, Rabie and Volynkin, Vladimir and Zellner, Hermann and Cetin-Atalay, Rengul and Martin, Maria and Atalay, Volkan},\ntitle = \"{CROssBAR: comprehensive resource of biomedical relations with knowledge graph representations}\",\njournal = {Nucleic Acids Research},\nvolume = {49},\nnumber = {16},\npages = {e96-e96},\nyear = {2021},\nmonth = {06},\nissn = {0305-1048},\ndoi = {10.1093/nar/gkab543},\nurl = {https://doi.org/10.1093/nar/gkab543},\n}"
57 | templates:
58 |   - The {node1_type#} {SMILES__description} {SMILES#} {rel1_type#} {node2_name#}.
59 |   - |-
60 |     Task: Please {#create|generate!} {#a compound |a !}{SMILES__description} that {rel1_type#} {node2_name#}.
61 |     Result: {SMILES#}
62 |   - |-
63 |     Task: Please {#create|generate!} {#a compound |a !}{SMILES__description} that {rel1_type#} {node2_name#}.
64 |     Result: {SMILES#}
--------------------------------------------------------------------------------
/data/kg/drug_chebi/meta.yaml:
--------------------------------------------------------------------------------
1 | name: drug_chebi
2 | description: Knowledge graph data samples.
3 | targets:
4 |   - id: node1_type
5 |     description: node1_type
6 |     type: Other
7 |     units: node1_type
8 |     names:
9 |       - noun: node1_type
10 |   - id: node1_name
11 |     description: node1_name
12 |     type: Other
13 |     units: node1_name
14 |     names:
15 |       - noun: node1_name
16 |   - id: node1_id
17 |     description: node1_id
18 |     type: Other
19 |     units: node1_id
20 |     names:
21 |       - noun: node1_id
22 |   - id: rel1_type
23 |     description: rel1_type
24 |     type: Other
25 |     units: rel1_type
26 |     names:
27 |       - noun: rel1_type
28 |   - id: node2_type
29 |     description: node2_type
30 |     type: Other
31 |     units: node2_type
32 |     names:
33 |       - noun: node2_type
34 |   - id: node2_name
35 |     description: node2_name
36 |     type: Other
37 |     units: node2_name
38 |     names:
39 |       - noun: node2_name
40 |   - id: node2_id
41 |     description: node2_id
42 |     type: Other
43 |     units: node2_id
44 |     names:
45 |       - noun: node2_id
46 | identifiers:
47 |   - id: SMILES
48 |     description: SMILES
49 |     type: SMILES
50 | license: CC BY 4.0
51 | links:
52 |   - url: https://crossbar.kansil.org
53 |     description: original knowledge graph web GUI link
54 | num_points: 3033
55 | bibtex:
56 |   - "@article{10.1093/nar/gkab543,\nauthor = {Doğan, Tunca and Atas, Heval and Joshi, Vishal and Atakan, Ahmet and Rifaioglu, Ahmet Sureyya and Nalbat, Esra and Nightingale, Andrew and Saidi, Rabie and Volynkin, Vladimir and Zellner, Hermann and Cetin-Atalay, Rengul and Martin, Maria and Atalay, Volkan},\ntitle = \"{CROssBAR: comprehensive resource of biomedical relations with knowledge graph representations}\",\njournal = {Nucleic Acids Research},\nvolume = {49},\nnumber = {16},\npages = {e96-e96},\nyear = {2021},\nmonth = {06},\nissn = {0305-1048},\ndoi = {10.1093/nar/gkab543},\nurl = {https://doi.org/10.1093/nar/gkab543},\n}"
57 | templates:
58 |   - The {node1_type#} {SMILES#} {rel1_type#} {node2_name#}.
--------------------------------------------------------------------------------
/data/natural/preprocess_msds.py:
--------------------------------------------------------------------------------
1 | """This script loads MSDS data parsed from Sigma Aldrich
2 | (https://huggingface.co/datasets/chemNLP/MSDS/tree/main) and flattens it.
3 |
4 | You need to change filepaths before running this script
5 | """
6 |
7 | import json
8 | import os
9 |
10 |
11 | def get_text(d, text="", level=1, linebreaks=2):
12 |     for k in d:
13 |         if k in [
14 |             "SECTION 6: Accidental release measures",  # always empty
15 |             "SECTION 1: Toxicological information",  # always empty
16 |             "SECTION 16: Other information",  # always the same information
17 |         ]:
18 |             continue
19 |
20 |         text += "#" * level + " " + k + "\n" * linebreaks
21 |
22 |         if isinstance(d[k], str):
23 |             if d[k] != "":
24 |                 text += d[k].rstrip() + "\n" * linebreaks
25 |         elif isinstance(d[k], dict):
26 |             text = get_text(d[k], text=text, level=level + 1)
27 |     return text
28 |
29 |
30 | if __name__ == "__main__":
31 |     path_jsonl_in = "/fsx/proj-chemnlp/micpie/chemnlp/data/natural/msds/msds.jsonl"
32 |
33 |     # load
34 |     with open(path_jsonl_in) as f:
35 |         data = [json.loads(line) for line in f]
36 |
37 |     # process
38 |     data = list(map(get_text, data))
39 |     data = [{"text": x} for x in data]
40 |
41 |     # save
42 |     path_jsonl_out = path_jsonl_in.replace(".jsonl", "_clean.jsonl")
43 |     if os.path.isfile(path_jsonl_out):
44 |         print(f"Output file already exists, please check: {path_jsonl_out}")
45 |     else:
46 |         with open(path_jsonl_out, "a") as fout:
47 |             for sample in data:
48 |                 fout.write(json.dumps(sample) + "\n")
49 |         print(f"JSONL saved to: {path_jsonl_out}")
50 |
--------------------------------------------------------------------------------
/data/natural/preprocess_nougat.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=preprocess_nougat
3 | #SBATCH --output=/fsx/proj-chemnlp/micpie/chemnlp/data/natural/%x_%j.out
4 | #SBATCH --account chemnlp
5 | #SBATCH --comment chemnlp
6 | #SBATCH --partition=cpu16
7 | #SBATCH --ntasks-per-node=1
8 | #SBATCH --cpus-per-task=1
9 |
10 | cd /fsx/proj-chemnlp/micpie/chemnlp/data/natural/
11 |
12 | ## ensure we can use activate syntax in slurm scripts
13 | export CONDA_ENV_PATH=/admin/home-micpie/miniconda3/envs/chemnlp
14 | CONDA_BASE=$(conda info --base)
15 | source $CONDA_BASE/etc/profile.d/conda.sh
16 | conda activate ${CONDA_ENV_PATH}
17 |
18 | python --version
19 |
20 | python preprocess_nougat.py
21 |
22 | #DATE=$(date -d "today" +"%Y%m%d%H%M")
23 | #echo $DATE
24 |
25 | #mv /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv.jsonl /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv_$DATE.jsonl
26 | #tar -cvzf /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv_$DATE.jsonl.tar.gz /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_chemrxiv_$DATE.jsonl
27 |
28 | #mv /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv.jsonl /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv_$DATE.jsonl
29 | #tar -cvzf /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv_$DATE.jsonl.tar.gz /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_biorxiv_$DATE.jsonl
30 |
31 | #mv /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv.jsonl /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv_$DATE.jsonl
32 | #tar -cvzf /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv_$DATE.jsonl.tar.gz /fsx/proj-chemnlp/micpie/chemnlp/data/natural/nougat_processed_medrxiv_$DATE.jsonl
33 |
--------------------------------------------------------------------------------
/data/tabular/BACE/transform.py:
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_dataset() -> pd.DataFrame: 5 | bace = pd.read_csv( 6 | "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv" 7 | ) 8 | return bace 9 | 10 | 11 | def transform_data(): 12 | bace = load_dataset() 13 | bace = bace.rename(columns={"mol": "SMILES", "Class": "BACE_inhibition"}) 14 | bace = bace.drop(columns=["CID", "Model", "canvasUID"]) 15 | 16 | # Keeping only qualitative and quantitative pIC50 values 17 | # Removing all the RDKit computed descriptors 18 | cols_to_keep = ["SMILES", "pIC50", "BACE_inhibition"] 19 | bace = bace[cols_to_keep] 20 | bace.to_csv("data_clean.csv", index=False) 21 | 22 | 23 | if __name__ == "__main__": 24 | transform_data() 25 | -------------------------------------------------------------------------------- /data/tabular/BBBP/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def load_dataset(): 5 | BBBP = pd.read_csv( 6 | "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv" 7 | ) 8 | return BBBP 9 | 10 | 11 | def transform_data(): 12 | BBBP = load_dataset() 13 | 14 | cols_to_keep = [ 15 | "smiles", 16 | "p_np", 17 | ] 18 | 19 | BBBP = BBBP[cols_to_keep] 20 | BBBP = BBBP.rename(columns={"smiles": "SMILES"}) 21 | BBBP.to_csv("data_clean.csv", index=False) 22 | 23 | 24 | if __name__ == "__main__": 25 | transform_data() 26 | -------------------------------------------------------------------------------- /data/tabular/MUV_466/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_466 2 | description: Activity in the MUV_466 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-466 9 | type: boolean 10 | description: MUV-466 11 | names: 12 | - noun: an agonist of the S1P1 receptor 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14841 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-466#not &NULL}{MUV-466__names__noun}. 
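# --- Editor's note (not part of the original meta.yaml): a hedged reading of the template ---
# --- placeholder syntax, inferred from how it is used across the meta.yaml files above.    ---
# {#molecule|compound|...!}   appears to pick one of the pipe-separated alternatives at random.
# {SMILES#}                   appears to insert the row's value from the SMILES column.
# {SMILES__description}       appears to insert the `description` field of the SMILES identifier.
# {MUV-466__names__noun}      appears to insert one of the `noun` entries defined under `names`.
# {MUV-466#not &NULL}         appears to map the boolean label to text, e.g. 0 -> "not " and 1 -> "" (empty).
# Under that reading, an inactive (label 0) row might render roughly as:
#   "The compound with the SMILES representation of <SMILES> is not an agonist of the S1P1 receptor."
# This is an illustrative sketch only; see src/chemnlp/data/sampler.py and docs/api/sampler.md in this
# repository for the actual sampling implementation.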
34 | -------------------------------------------------------------------------------- /data/tabular/MUV_466/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_466/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_548/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_548 2 | description: Activity in the MUV_548 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-548 9 | type: boolean 10 | description: MUV-548 11 | names: 12 | - noun: an inhibitor of the protein kinase A (PKA) 13 | - noun: an inhibitor of the protein kinase A 14 | - noun: an inhibitor of PKA 15 | license: CC BY 4.0 16 | links: 17 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 18 | description: corresponding publication 19 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 20 | description: Data source 21 | num_points: 14734 22 | bibtex: 23 | - | 24 | @article{doi:10.1021/ci8002649, 25 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 26 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 27 | journal = {Journal of Chemical Information and Modeling}, 28 | volume = {49}, 29 | number = {2}, 30 | pages = {169-184}, 31 | year = {2009}, 32 | doi = {10.1021/ci8002649}, 33 | URL = {https://doi.org/10.1021/ci8002649}} 34 | templates: 35 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-548#not &NULL}{MUV-548__names__noun}. 
36 | -------------------------------------------------------------------------------- /data/tabular/MUV_548/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_548/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_600/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_600 2 | description: Activity in the MUV_600 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-600 9 | type: boolean 10 | description: MUV-600 11 | names: 12 | - noun: an inhibitor of the steroidogenic factor 1 (SF-1) 13 | - noun: an inhibitor of SF-1 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14728 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-600#not &NULL}{MUV-600__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_600/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_600/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_644/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_644 2 | description: Activity in the MUV_644 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-644 9 | type: boolean 10 | description: MUV-644 11 | names: 12 | - noun: an inhibitor of Rho-kinase 2 (ROCK-2) 13 | - noun: an inhibitor of ROCK-2 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14623 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-644#not &NULL}{MUV-644__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_644/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_644/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_652/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_652 2 | description: Activity in the MUV_652 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-652 9 | type: boolean 10 | description: MUV-652 11 | names: 12 | - noun: an inhibitor of HIV RT-RNase 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14902 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-652#not &NULL}{MUV-652__names__noun}. 
34 | -------------------------------------------------------------------------------- /data/tabular/MUV_652/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_652/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_689/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_689 2 | description: Activity in the MUV_689 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-689 9 | type: boolean 10 | description: MUV-689 11 | names: 12 | - noun: an inhibitor of the EPH receptor A4 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14601 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-689#not &NULL}{MUV-689__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_689/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_689/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_692/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_692 2 | description: Activity in the MUV_692 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-692 9 | type: boolean 10 | description: MUV-692 11 | names: 12 | - noun: an agonist of the steroidogenic factor 1 (SF-1) 13 | - noun: an agonist of SF-1 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14644 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-692#not &NULL}{MUV-692__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_692/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_692/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_712/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_712 2 | description: Activity in the MUV_712 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-712 9 | type: boolean 10 | description: MUV-712 11 | names: 12 | - noun: an inhibitor of the heat shock protein 90 13 | - noun: an inhibitor of HSP90 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14411 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-712#not &NULL}{MUV-712__names__noun}. 
35 | -------------------------------------------------------------------------------- /data/tabular/MUV_712/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_712/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_713/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_713 2 | description: Activity in the MUV_713 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-713 9 | type: boolean 10 | description: MUV-713 11 | names: 12 | - noun: an inhibitor of the estrogen receptor-alpha-coactivator binding 13 | - noun: an inhibitor of the ER-alpha-coact. binding 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14836 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-713#not &NULL}{MUV-713__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_713/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_713/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_733/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_733 2 | description: Activity in the MUV_733 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-733 9 | type: boolean 10 | description: MUV-733 11 | names: 12 | - noun: an inhibitor of the estrogen receptor-alpha-coactivator binding 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14682 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-733#not &NULL}{MUV-733__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_733/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_733/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_737/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_737 2 | description: Activity in the MUV_737 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-737 9 | type: boolean 10 | description: MUV-737 11 | names: 12 | - noun: a potentiator of the estrogen receptor-alpha-coactivator binding 13 | - noun: a potentiator of the ER-alpha-coact. binding 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14691 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-737#not &NULL}{MUV-737__names__noun}. 
35 | -------------------------------------------------------------------------------- /data/tabular/MUV_737/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_737/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_810/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_810 2 | description: Activity in the MUV_810 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-810 9 | type: boolean 10 | description: MUV-810 11 | names: 12 | - noun: an inhibitor of the focal adhesion kinase 13 | - noun: an inhibitor of FAK 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14644 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-810#not &NULL}{MUV-810__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_810/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_810/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_832/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_832 2 | description: Activity in the MUV_832 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-832 9 | type: boolean 10 | description: MUV-832 11 | names: 12 | - noun: an inhibitor of the Cathepsin G protease 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14667 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-832#not &NULL}{MUV-832__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_832/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_832/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_846/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_846 2 | description: Activity in the MUV_846 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-846 9 | type: boolean 10 | description: MUV-846 11 | names: 12 | - noun: an inhibitor of factor XIa (FXIa) 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14711 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-846#not &NULL}{MUV-846__names__noun}. 34 | - |- 35 | Question: Is the {SMILES__description} {SMILES#} {MUV-846__names__noun}? 36 | 37 | Answer:{MUV-846#no&yes} 38 | - |- 39 | Task: Please {#determine|predict|estimate!} if the {#molecule|compound!} with the {SMILES__description} {SMILES#} is {MUV-846__names__noun}. 40 | 41 | Result:{MUV-846#no&yes} 42 | - |- 43 | Task: Please {#create|generate!} a {#molecule|compound!} that is {MUV-846#not &NULL}{MUV-846__names__noun} and report its {SMILES__description}. 44 | 45 | Result:{SMILES#} 46 | - |- 47 | {#Create|Generate|Propose|Suggest|Design|Invent!} a {#molecule|compound!} that is {MUV-846#not &NULL}{MUV-846__names__noun} and report its {SMILES__description}. 
48 | 49 | Result: {SMILES#} 50 | -------------------------------------------------------------------------------- /data/tabular/MUV_846/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_846/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_852/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_852 2 | description: Activity in the MUV_852 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-852 9 | type: boolean 10 | description: MUV-852 11 | names: 12 | - noun: an inhibitor of factor XIIa (FXIIa) 13 | license: CC BY 4.0 14 | links: 15 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 16 | description: corresponding publication 17 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 18 | description: Data source 19 | num_points: 14651 20 | bibtex: 21 | - | 22 | @article{doi:10.1021/ci8002649, 23 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 24 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 25 | journal = {Journal of Chemical Information and Modeling}, 26 | volume = {49}, 27 | number = {2}, 28 | pages = {169-184}, 29 | year = {2009}, 30 | doi = {10.1021/ci8002649}, 31 | URL = {https://doi.org/10.1021/ci8002649}} 32 | templates: 33 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-852#not &NULL}{MUV-852__names__noun}. 34 | -------------------------------------------------------------------------------- /data/tabular/MUV_852/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_852/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_858/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_858 2 | description: Activity in the MUV_858 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-858 9 | type: boolean 10 | description: MUV-858 11 | names: 12 | - noun: an allosteric modulator of the dopamine receptor D1 13 | - noun: an allosteric modulator of the D1 receptor 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14774 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. 
and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-858#not &NULL}{MUV-858__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_858/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_858/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/MUV_859/meta.yaml: -------------------------------------------------------------------------------- 1 | name: MUV_859 2 | description: Activity in the MUV_859 assay 3 | identifiers: 4 | - id: SMILES 5 | type: SMILES 6 | description: SMILES 7 | targets: 8 | - id: MUV-859 9 | type: boolean 10 | description: MUV-859 11 | names: 12 | - noun: an allosteric inhibitor of the muscarinic acetylcholine receptor M1 13 | - noun: an allosteric inhibitor of the M1 receptor 14 | license: CC BY 4.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/44/D1/D1075/2502602?login=false 17 | description: corresponding publication 18 | - url: https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/muv.csv.gz 19 | description: Data source 20 | num_points: 14746 21 | bibtex: 22 | - | 23 | @article{doi:10.1021/ci8002649, 24 | author = {Rohrer, Sebastian G. and Baumann, Knut}, 25 | title = {Maximum Unbiased Validation (MUV) Data Sets for Virtual Screening Based on PubChem Bioactivity Data}, 26 | journal = {Journal of Chemical Information and Modeling}, 27 | volume = {49}, 28 | number = {2}, 29 | pages = {169-184}, 30 | year = {2009}, 31 | doi = {10.1021/ci8002649}, 32 | URL = {https://doi.org/10.1021/ci8002649}} 33 | templates: 34 | - The {#molecule|compound|chemical|molecular species|chemical compound!} with the {SMILES__description} {#representation of |!}{SMILES#} is {MUV-859#not &NULL}{MUV-859__names__noun}. 35 | -------------------------------------------------------------------------------- /data/tabular/MUV_859/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/MUV", filename="MUV_859/data_clean.csv", repo_type="dataset" 8 | ) 9 | df = pd.read_csv(file) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | transform_data() 15 | -------------------------------------------------------------------------------- /data/tabular/aminoacids/meta.yaml: -------------------------------------------------------------------------------- 1 | name: aminoacids 2 | description: |- 3 | The list of the 20 essential aminoacids, their SMILES, one letter and three letter codes. 
4 | targets: 5 | - id: three_letter_code 6 | description: three-letter code 7 | type: text 8 | - id: one_letter_code 9 | description: one-letter code 10 | type: text 11 | - id: aminoacid_name 12 | description: name 13 | type: text 14 | - id: type 15 | description: type of aminoacid 16 | type: text 17 | identifiers: 18 | - id: SMILES 19 | type: SMILES 20 | description: SMILES 21 | license: CC BY 4.0 22 | links: 23 | - url: https://chemistry.stackexchange.com/questions/138614/why-are-tyrosine-and-tryptophan-considered-hydrophobic 24 | description: reference for amino acid type 25 | num_points: 20 26 | templates: 27 | - The {#essential amino acid|amino acid|amino acid (AA)|AA!} with the {SMILES__description} {SMILES#} has the one-letter code {one_letter_code#} and the three-letter code {three_letter_code#}. 28 | - The {#essential amino acid|amino acid|amino acid (AA)|AA!} {aminoacid_name#} has the one-letter code {one_letter_code#} and the three-letter code {three_letter_code#}. 29 | - |- 30 | Question: What is the one-letter code of the {#essential amino acid|amino acid|amino acid (AA)|AA!} with the {SMILES__description} {SMILES#}? 31 | Answer: {one_letter_code#}. 32 | - |- 33 | Question: What is the three-letter code of the {#essential amino acid|amino acid|amino acid (AA)|AA!} with the {SMILES__description} {SMILES#}? 34 | Answer: {three_letter_code#}. 35 | - |- 36 | Question: What is the type of the amino acid with the one-letter code {one_letter_code#} and {SMILES__description} {SMILES#}? 37 | Constraint: The possible types are: polar, non-polar, positively charged, negatively charged. 38 | Answer: From the provided amino acid types (polar, non-polar, positively charged, negatively charged), the amino acid with the one-letter code {one_letter_code#} is {type#}. 39 | -------------------------------------------------------------------------------- /data/tabular/aminoacids/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def extract_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/uniprot", filename="aminoacid_seq.csv", repo_type="dataset" 8 | ) 9 | aminoacids = pd.read_csv(file) 10 | aminoacids.to_csv("data_clean.csv", index=False) 11 | return aminoacids 12 | 13 | 14 | if __name__ == "__main__": 15 | extract_data() 16 | -------------------------------------------------------------------------------- /data/tabular/bc5chem/meta.yaml: -------------------------------------------------------------------------------- 1 | name: bc5chem 2 | description: |- 3 | BC5CHEM is a named entity recognition dataset for chemical mentions. 
4 | targets: 5 | - id: matched_words 6 | description: matched words 7 | type: text 8 | names: 9 | - noun: entity 10 | - noun: matched entity 11 | identifiers: 12 | - id: sentence 13 | description: Sentence 14 | type: text 15 | names: 16 | - noun: sentence 17 | - noun: text 18 | license: https://huggingface.co/datasets/bigbio/blurb/blob/main/LICENSE 19 | links: 20 | - url: https://huggingface.co/datasets/bigbio/blurb 21 | description: original dataset 22 | benchmarks: 23 | - name: bc5chem 24 | link: https://huggingface.co/datasets/bigbio/blurb 25 | split_column: split 26 | num_points: 13755 27 | bibtex: 28 | - |- 29 | @article{gu2021domain, 30 | title = { 31 | Domain-specific language model pretraining for biomedical natural 32 | language processing 33 | }, 34 | author = { 35 | Gu, Yu and Tinn, Robert and Cheng, Hao and Lucas, Michael and 36 | Usuyama, Naoto and Liu, Xiaodong and Naumann, Tristan and Gao, 37 | Jianfeng and Poon, Hoifung 38 | }, 39 | year = 2021, 40 | journal = {ACM Transactions on Computing for Healthcare (HEALTH)}, 41 | publisher = {ACM New York, NY}, 42 | volume = 3, 43 | number = 1, 44 | pages = {1--23} 45 | } 46 | templates: 47 | - |- 48 | Task: Find all the mentions of {#chemicals|chemical compounds|chemical substances!} in the {#following|subsequent!} {#text|sentence!}. Return the matching {#words|entities!}. If there is no {#match|mention of a chemical|matching entity!}, return `no match`. 49 | {#Sentence|Description!}: {sentence#} 50 | Answer: {matched_words#} 51 | - |- 52 | User: Does the following text contain mentions of {#chemicals|chemical compounds|chemical substances!}?{# Can you return matches?| Can you output matches?| Please return matches.!} 53 | {#Text: |!}{sentence#} 54 | Assistant: {#I found|There is!} {matched_words#}.
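Note on the prompt-template syntax used in these meta.yaml files: `{#a|b|c!}` appears to mark interchangeable phrasings, `{column#}` is filled from the corresponding column of data_clean.csv, `{column__names__noun}` seems to draw from the noun list declared for that column, and boolean forms such as `{MUV-846#no&yes}` appear to pick one of the two texts around the `&` based on the target value. The snippet below is a minimal, hypothetical sketch of the two simplest cases (random choice and column substitution) under those assumed semantics; it is not the project's actual sampling code, and the `expand` helper and the example row are invented for illustration.

import random
import re


def expand(template: str, row: dict) -> str:
    # Assumed semantics, for illustration only:
    # "{#opt1|opt2!}" -> pick one of the listed options at random
    template = re.sub(r"\{#(.*?)!\}", lambda m: random.choice(m.group(1).split("|")), template)
    # "{column#}" -> substitute the row's value for that column
    return re.sub(r"\{([\w-]+)#\}", lambda m: str(row[m.group(1)]), template)


example_row = {"sentence": "Aspirin was detected in the sample.", "matched_words": "Aspirin"}
example_template = "{#Sentence|Description!}: {sentence#}\nAnswer: {matched_words#}"
print(expand(example_template, example_row))

Running this prints one randomly phrased prompt/completion pair for the example row; the mapped boolean forms and `__names__` lookups would need additional handling that is omitted here.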
55 | -------------------------------------------------------------------------------- /data/tabular/bc5chem/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | 4 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner 5 | from chemnlp.data.utils import oxford_comma_join 6 | 7 | 8 | def process(): 9 | # tokenized at whitespaces and punctuations 10 | dataset = load_dataset("bigbio/blurb", "bc5chem") 11 | dfs = [] 12 | for split in ["train", "validation", "test"]: 13 | df_ = dataset[split].to_pandas() 14 | df_["split"] = split 15 | dfs.append(df_) 16 | df = pd.concat(dfs) 17 | ner_labels = df["ner_tags"] 18 | 19 | matched_words = [] 20 | for tokens, ner_label in zip(df["tokens"], ner_labels): 21 | words = group_tokens_by_labels(tokens, ner_label) 22 | if len(words) == 0: 23 | matched_words.append("no match") 24 | else: 25 | matched_words.append(oxford_comma_join(words)) 26 | 27 | df["matched_words"] = matched_words 28 | df["sentence"] = df["tokens"].apply(punctuation_joiner) 29 | 30 | df = df[["sentence", "matched_words"]] 31 | 32 | # ensure we have at least 5 words in a sentence 33 | df = df[df["sentence"].apply(lambda x: len(x.split()) >= 5)] 34 | 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/bc5disease/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | 4 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner 5 | from chemnlp.data.utils import oxford_comma_join 6 | 7 | 8 | def process(): 9 | # tokenized at whitespaces and punctuations 10 | dataset = load_dataset("bigbio/blurb", "bc5disease") 11 | dfs = [] 12 | for split in ["train", "validation", "test"]: 13 | df_ = dataset[split].to_pandas() 14 | df_["split"] = split 15 | dfs.append(df_) 16 | df = pd.concat(dfs) 17 | ner_labels = df["ner_tags"] 18 | 19 | matched_words = [] 20 | for tokens, ner_label in zip(df["tokens"], ner_labels): 21 | words = group_tokens_by_labels(tokens, ner_label) 22 | if len(words) == 0: 23 | matched_words.append("no match") 24 | else: 25 | matched_words.append(oxford_comma_join(words)) 26 | 27 | df["matched_words"] = matched_words 28 | df["sentence"] = df["tokens"].apply(punctuation_joiner) 29 | 30 | df = df[["sentence", "matched_words"]] 31 | 32 | # ensure we have at least 5 words in a sentence 33 | df = df[df["sentence"].apply(lambda x: len(x.split()) >= 5)] 34 | 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/bicerano_dataset/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from canonicalize_psmiles.canonicalize import canonicalize 3 | from huggingface_hub import hf_hub_download 4 | 5 | 6 | def transform_data(): 7 | file = hf_hub_download( 8 | repo_id="chemNLP/bicerano_polymers", 9 | filename="HT_MD_polymer_properties.csv", 10 | repo_type="dataset", 11 | ) 12 | original_data = pd.read_csv(file) 13 | clean_data = original_data.drop("sl_num", axis=1) 14 | 15 | assert not clean_data.duplicated().sum() 16 | 17 | clean_columns = [ 18 | 
"compound_name", 19 | "PSMILES", 20 | "Tg_exp", 21 | "Tg_calc", 22 | "Tg_calc_std", 23 | "rho_300K_exp", 24 | "rho_300K_calc", 25 | "rho_300K_calc_std", 26 | "glass_CTE_calc", 27 | "glass_CTE_calc_std", 28 | "rubber_CTE_calc", 29 | "rubber_CTE_calc_std", 30 | ] 31 | 32 | clean_data.columns = clean_columns 33 | 34 | clean_data["compound_name"] = clean_data["compound_name"].str.strip() 35 | 36 | clean_data["PSMILES"] = clean_data["PSMILES"].str.replace( 37 | "[Ce]", "[*]", regex=False 38 | ) 39 | clean_data["PSMILES"] = clean_data["PSMILES"].str.replace( 40 | "[Th]", "[*]", regex=False 41 | ) 42 | clean_data["PSMILES"] = clean_data["PSMILES"].str.replace( 43 | "[[*]]", "[*]", regex=False 44 | ) 45 | 46 | clean_data["PSMILES"] = clean_data["PSMILES"].apply( 47 | lambda smiles: canonicalize(smiles) 48 | ) 49 | 50 | clean_data.to_csv("data_clean.csv") 51 | 52 | 53 | if __name__ == "__main__": 54 | transform_data() 55 | -------------------------------------------------------------------------------- /data/tabular/bio_ner/meta.yaml: -------------------------------------------------------------------------------- 1 | name: bio_ner 2 | description: NER task on bio-related text. 3 | identifiers: 4 | - id: Sentence 5 | description: Sentence 6 | type: Other 7 | targets: 8 | - id: entity_1 9 | description: entity_1 10 | type: Other 11 | units: entity_1 12 | names: 13 | - noun: entity_1 14 | - id: json 15 | description: json 16 | type: Other 17 | units: 18 | names: 19 | - noun: JSON output 20 | benchmarks: 21 | - name: bio_ner 22 | link: https://github.com/ML4LitS/bio-datasets 23 | split_column: split 24 | license: unknown 25 | links: 26 | - url: https://github.com/ML4LitS/bio-datasets 27 | description: ??? 28 | num_points: 123509 29 | bibtex: 30 | - ??? 31 | templates: 32 | - |- 33 | Task: Please carry out the {#named entity recognition (NER)|named entity recognition|NER!} task for the the text below. 34 | Text: {Sentence#}. 35 | Constrain: Please, {#only |!}list the entities in the form NER entity, span start, span end, and type {#in separate lines |!}with a high probability of being in the text. 
36 | Result: {entity_1#} 37 | -------------------------------------------------------------------------------- /data/tabular/block_polymers_morphology/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | columns_to_keep = ["phase1", "T", "BigSMILES", "Mn", "f1", "Mw", "D"] 4 | 5 | 6 | def process(): 7 | df = pd.read_csv( 8 | "https://raw.githubusercontent.com/olsenlabmit/BCDB/main/data/diblock.csv" 9 | ) 10 | df = df[df["phase2"].isna()] # remove multiple phases 11 | mw_clean = [] 12 | dispersity_clean = [] 13 | 14 | for mw, dispersity in zip(df["Mw"], df["D"]): 15 | # if nan, make empty string 16 | # else, add the units 17 | if pd.isna(mw) or "nan" in str(mw): 18 | mw_clean.append("REPLACENULL") 19 | else: 20 | mw_clean.append(f", average molecular mass of {mw:.1f} g/mol") 21 | 22 | if pd.isna(dispersity) or "nan" in str(dispersity): 23 | # empty character that will still appear in the csv 24 | dispersity_clean.append("REPLACENULL") 25 | else: 26 | dispersity_clean.append(f", and dispersity of {dispersity:.1f}") 27 | 28 | df["Mw"] = mw_clean 29 | df["D"] = dispersity_clean 30 | df.dropna(subset=columns_to_keep, inplace=True) 31 | print(len(df)) 32 | df[columns_to_keep].to_csv("data_clean.csv", index=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | process() 37 | -------------------------------------------------------------------------------- /data/tabular/caco2_wang/meta.yaml: -------------------------------------------------------------------------------- 1 | name: caco2_wang 2 | description: |- 3 | The human colon epithelial cancer cell line, Caco-2, 4 | is used as an in vitro model to simulate the human intestinal tissue. 5 | The experimental result on the rate of drug passing through 6 | the Caco-2 cells can approximate the rate at which the drug permeates 7 | through the human intestinal tissue. 8 | targets: 9 | - id: permeability 10 | description: Caco-2 cell effective permeability. 
11 | units: cm/s 12 | type: continuous 13 | names: 14 | - noun: Caco-2 cell effective permeability 15 | - noun: Caco-2 cell permeability 16 | - noun: Caco-2 permeability 17 | pubchem_aids: 18 | - 678378 19 | uris: 20 | - http://www.bioassayontology.org/bao#BAO_0010008 21 | - http://purl.obolibrary.org/obo/MI_2162 22 | benchmarks: 23 | - name: TDC 24 | link: https://tdcommons.ai/ 25 | split_column: split 26 | identifiers: 27 | - id: SMILES 28 | type: SMILES 29 | description: SMILES 30 | - id: compound_name 31 | type: Other 32 | description: compound name 33 | names: 34 | - noun: compound 35 | - noun: compound name 36 | license: CC BY 4.0 37 | links: 38 | - url: https://tdcommons.ai/single_pred_tasks/adme/#caco-2-cell-effective-permeability-wang-et-al 39 | description: original data set link 40 | - url: https://pubs.acs.org/doi/10.1021/acs.jcim.5b00642 41 | description: corresponding publication 42 | num_points: 910 43 | bibtex: 44 | - |- 45 | @article{wang2016adme, 46 | title={ADME properties evaluation in drug discovery: prediction of Caco-2 cell permeability 47 | using a combination of NSGA-II and boosting}, 48 | author={Wang, Ning-Ning and Dong, Jie and Deng, Yin-Hua and Zhu, Min-Feng and Wen, Ming and Yao, 49 | Zhi-Jiang and Lu, Ai-Ping and Wang, Jian-Bing and Cao, Dong-Sheng}, 50 | journal={Journal of Chemical Information and Modeling}, 51 | volume={56}, 52 | number={4}, 53 | pages={763--773}, 54 | year={2016}, 55 | publisher={ACS Publications} 56 | } 57 | -------------------------------------------------------------------------------- /data/tabular/chem_caption_smarts/meta.yaml: -------------------------------------------------------------------------------- 1 | name: chem_caption_smarts 2 | description: |- 3 | This dataset contains the count of substructures in molecules 4 | targets: 5 | - id: smarts 6 | type: text 7 | description: substructure smarts 8 | names: 9 | - noun: SMARTS 10 | - noun: SMiles ARbitrary Target Specification (SMARTS) 11 | - id: completion 12 | type: categorical 13 | description: number of matches 14 | - id: completion_labels 15 | type: text 16 | description: name of the substructure 17 | identifiers: 18 | - id: representation 19 | type: text 20 | description: representation 21 | - id: representation_type 22 | type: text 23 | description: representation type 24 | license: CC BY 4.0 25 | links: 26 | - url: https://github.com/lamalab-org/chem-caption 27 | description: Original codebase used to generate this dataset 28 | templates: 29 | - |- 30 | Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain the substructure with the {smarts__names__noun} {#smarts#}? 31 | Answer: {completion#} 32 | - |- 33 | Question: {#How many times|How often!} does the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contain a {completion#} substructure? 34 | Answer: {smarts__names__noun} {#smarts#} 35 | - |- 36 | User: {#I want to|I have to|I must|I would like to!} know {#how many times|how often!} the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#}. 37 | Assistant: The {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. 
38 | - |- 39 | User: {#I want to|I have to|I must|I would like to!} know how many times the {#molecule|chemical|compound|chemical structure!} with {representation_type#} {representation#} contains a {completion#} substructure. 40 | Assistant: The {#molecule|chemical|compound|chemical structure!} contains the substructure with the {smarts__names__noun} {#smarts#} {completion#} times. 41 | -------------------------------------------------------------------------------- /data/tabular/chem_caption_smarts/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | # get the smarts config 6 | df = pd.read_parquet( 7 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/smarts/train-00000-of-00001-71cef18c6383b463.parquet" # noqa 8 | ) 9 | df["completion_labels"] = df["completion_labels"].astype(str) 10 | df["completion_labels"] = df["completion_labels"].str.replace( 11 | "_count", "", regex=True 12 | ) 13 | df.to_csv("data_clean.csv", index=False) 14 | 15 | 16 | if __name__ == "__main__": 17 | process() 18 | -------------------------------------------------------------------------------- /data/tabular/chembl_v29/meta.yaml: -------------------------------------------------------------------------------- 1 | name: chembl_v29 2 | description: |- 3 | ChEMBL is a manually curated database of bioactive molecules with drug-like properties. 4 | It brings together chemical, bioactivity and genomic data 5 | to aid the translation of genomic information into effective new drugs. 6 | benchmarks: 7 | - name: TDC 8 | link: https://tdcommons.ai/ 9 | split_column: split 10 | identifiers: 11 | - id: SMILES 12 | type: SMILES 13 | description: SMILES 14 | license: CC BY-SA 3.0 15 | links: 16 | - url: https://academic.oup.com/nar/article/47/D1/D930/5162468 17 | description: Article about original dataset 18 | - url: https://academic.oup.com/nar/article/43/W1/W612/2467881 19 | description: Exemplary related article shown in tdc's website 20 | num_points: 2084637 21 | bibtex: 22 | - |- 23 | @article{10.1093/nar/gky1075, 24 | author = {Mendez, David and Gaulton, Anna and Bento, A Patricia and Chambers, Jon and De Veij, 25 | Marleen and Felix, Eloy and Magarinos, Maria Paula and Mosquera, 26 | Juan F and Mutowo, Prudence and Nowotka, Michal and Gordillo-Maranon, 27 | Maria and Hunter, Fiona and Junco, Laura and Mugumbate, Grace and Rodriguez-Lopez, Milagros and Atkinson, 28 | Francis and Bosc, Nicolas and Radoux, Chris J and Segura-Cabrera, Aldo and Hersey, Anne and Leach, Andrew R}, 29 | title = {ChEMBL: towards direct deposition of bioassay data}, 30 | journal = {Nucleic Acids Research}, 31 | volume = {47}, 32 | number = {D1}, 33 | pages = {D930-D940}, 34 | year = {2018}, 35 | month = {11}, 36 | abstract = "{ChEMBL is a large, open-access bioactivity database 37 | (https://www.ebi.ac.uk/chembl), previously described in the 2012, 38 | 2014 and 2017 Nucleic Acids Research Database Issues. 39 | In the last two years, several important improvements have been made to the database and are described here. 
40 | These include more robust capture and representation of assay details; 41 | a new data deposition system, allowing updating of data sets and deposition of supplementary data; 42 | and a completely redesigned web interface, with enhanced search and filtering capabilities.}", 43 | issn = {0305-1048}, 44 | doi = {10.1093/nar/gky1075}, 45 | url = {https://doi.org/10.1093/nar/gky1075}, 46 | eprint = {https://academic.oup.com/nar/article-pdf/47/D1/D930/27437436/gky1075.pdf}, 47 | } 48 | -------------------------------------------------------------------------------- /data/tabular/chemcaption_fragments/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_parquet( 6 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/smarts/train-00000-of-00001-71cef18c6383b463.parquet?download=true" # noqa 7 | ) 8 | df.dropna(inplace=True) 9 | print(len(df)) 10 | df["fragment"] = df["completion_labels"].str.replace("_count", "") 11 | df["presence"] = df["completion"] > 0 12 | df["molecule"] = df["representation"] 13 | df.to_csv("data_clean.csv", index=False) 14 | 15 | 16 | if __name__ == "__main__": 17 | process() 18 | -------------------------------------------------------------------------------- /data/tabular/chemcaption_rdkit/transform.py: -------------------------------------------------------------------------------- 1 | import fire 2 | import pandas as pd 3 | 4 | 5 | def process(): 6 | df = pd.read_parquet( 7 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-chem-caption/resolve/main/rdkit_feat/train-00000-of-00001-7cea16ab26bf74cf.parquet?download=true" # noqa 8 | ) 9 | df["num_bonds_simple"] = df[ 10 | [ 11 | "num_single_bonds", 12 | "num_double_bonds", 13 | "num_triple_bonds", 14 | "num_quadruple_bonds", 15 | "num_quintuple_bonds", 16 | "num_aromatic_bonds", 17 | ] 18 | ].sum(axis=1) 19 | 20 | df = df[df["num_bonds_simple"].astype(int) == df["num_bonds"].astype(int)] 21 | 22 | df[ 23 | [ 24 | "num_valence_electrons", 25 | "num_single_bonds", 26 | "num_double_bonds", 27 | "num_triple_bonds", 28 | "num_quadruple_bonds", 29 | "num_quintuple_bonds", 30 | "num_aromatic_bonds", 31 | "num_bonds", 32 | "num_carbon_atoms", 33 | "num_hydrogen_atoms", 34 | "num_nitrogen_atoms", 35 | "num_oxygen_atoms", 36 | "num_hydrogen_bond_acceptors", 37 | "num_hydrogen_bond_donors", 38 | "num_lipinski_violations", 39 | "num_chiral_centers", 40 | ] 41 | ] = df[ 42 | [ 43 | "num_valence_electrons", 44 | "num_single_bonds", 45 | "num_double_bonds", 46 | "num_triple_bonds", 47 | "num_quadruple_bonds", 48 | "num_quintuple_bonds", 49 | "num_aromatic_bonds", 50 | "num_bonds", 51 | "num_carbon_atoms", 52 | "num_hydrogen_atoms", 53 | "num_nitrogen_atoms", 54 | "num_oxygen_atoms", 55 | "num_hydrogen_bond_acceptors", 56 | "num_hydrogen_bond_donors", 57 | "num_lipinski_violations", 58 | "num_chiral_centers", 59 | ] 60 | ].astype( 61 | int 62 | ) 63 | print(len(df)) 64 | df.to_csv("data_clean.csv", index=False) 65 | 66 | 67 | if __name__ == "__main__": 68 | fire.Fire(process) 69 | -------------------------------------------------------------------------------- /data/tabular/chemdner/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | from chemnlp.data.utils import oxford_comma_join 4 | 5 | 6 | def process(): 7 | dataset = load_dataset("kjappelbaum/chemnlp-chemdner") 8 | df = dataset["train"].to_pandas() 9 
| 10 | matched_words = [] 11 | for ent in df["entities"]: 12 | if len(ent) == 0: 13 | matched_words.append("no match") 14 | else: 15 | matched_words.append(oxford_comma_join(ent)) 16 | 17 | df["matched_words"] = matched_words 18 | df["sentence"] = df["text"] 19 | 20 | print(len(df)) 21 | 22 | df = df[["sentence", "matched_words"]] 23 | df.to_csv("data_clean.csv", index=False) 24 | 25 | 26 | if __name__ == "__main__": 27 | process() 28 | -------------------------------------------------------------------------------- /data/tabular/chemistry_stackexchange/meta.yaml: -------------------------------------------------------------------------------- 1 | name: chemistry_stackexchange 2 | description: |- 3 | Questions and answers mined from chemistry.stackexchange.com. 4 | targets: 5 | - id: a 6 | description: answer to the question 7 | type: string 8 | - id: title 9 | description: title of the question 10 | type: string 11 | identifiers: 12 | - id: q 13 | type: string 14 | description: question asked on chemistry.stackexchange.com 15 | license: CC BY-SA 16 | links: 17 | - url: chemistry.stackexchange.com 18 | description: original data source 19 | - url: https://stackoverflow.com/help/licensing 20 | description: information about the license 21 | num_points: 4582 22 | templates: 23 | - |- 24 | {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} 25 | {#User: |Question: |Inquiry: |\n!}{#q} 26 | {#Assistant: |Answer: !}{#a} 27 | - |- 28 | {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} 29 | {#Question: |Inquiry: |\n!}{#q} 30 | {#Assistant: |Title: |Answer: |!}{#title} 31 | -------------------------------------------------------------------------------- /data/tabular/core_mof_no_topo/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-core-mof", 8 | filename="core_mofid.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df = df.query("is_longer_than_allowed==False").dropna( 13 | subset=[ 14 | "outputs.pure_CO2_kH", 15 | "outputs.pure_CO2_widomHOA", 16 | "outputs.pure_methane_kH", 17 | "outputs.pure_methane_widomHOA", 18 | "outputs.pure_uptake_CO2_298.00_15000", 19 | "outputs.pure_uptake_CO2_298.00_1600000", 20 | "outputs.pure_uptake_methane_298.00_580000", 21 | "outputs.pure_uptake_methane_298.00_6500000", 22 | "outputs.logKH_CO2", 23 | "outputs.logKH_CH4", 24 | "outputs.CH4DC", 25 | "outputs.CH4HPSTP", 26 | "outputs.CH4LPSTP", 27 | "smiles_linkers", 28 | "smiles_nodes", 29 | ] 30 | ) 31 | 32 | print(len(df)) 33 | 34 | df["smiles_linkers"] = df["smiles_linkers"].apply(lambda x: ", ".join(x)) 35 | df["smiles_nodes"] = df["smiles_nodes"].apply(lambda x: ", ".join(x)) 36 | 37 | df[ 38 | [ 39 | "outputs.pure_CO2_kH", 40 | "outputs.pure_CO2_widomHOA", 41 | "outputs.pure_methane_kH", 42 | "outputs.pure_methane_widomHOA", 43 | "outputs.pure_uptake_CO2_298.00_15000", 44 | "outputs.pure_uptake_CO2_298.00_1600000", 45 | "outputs.pure_uptake_methane_298.00_580000", 46 | 
"outputs.pure_uptake_methane_298.00_6500000", 47 | "outputs.logKH_CO2", 48 | "outputs.logKH_CH4", 49 | "outputs.CH4DC", 50 | "outputs.CH4HPSTP", 51 | "outputs.CH4LPSTP", 52 | "smiles_linkers", 53 | "smiles_nodes", 54 | "cif", 55 | ] 56 | ].to_csv("data_clean.csv", index=False) 57 | 58 | 59 | if __name__ == "__main__": 60 | process() 61 | -------------------------------------------------------------------------------- /data/tabular/drugchat_liang_zhang_et_al/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import concatenate_datasets, load_dataset 2 | 3 | PUBCHEM_DATASET = "alxfgh/PubChem_Drug_Instruction_Tuning" 4 | CHEMBL_DATASET = "alxfgh/ChEMBL_Drug_Instruction_Tuning" 5 | 6 | 7 | if __name__ == "__main__": 8 | # Load the two datasets 9 | dataset1 = load_dataset(PUBCHEM_DATASET) 10 | dataset2 = load_dataset(CHEMBL_DATASET) 11 | 12 | # Verify that the datasets have the same schema (i.e., the same fields) 13 | assert ( 14 | dataset1["train"].features == dataset2["train"].features 15 | ), "Datasets do not have the same schema" 16 | 17 | # Concatenate the 'train' split of dataset2 to the 'train' split of dataset1 18 | combined_dataset = concatenate_datasets([dataset1["train"], dataset2["train"]]) 19 | 20 | # Define the fractions for train/test/valid split 21 | train_fraction = 0.8 22 | test_fraction = 0.1 23 | # The remaining part will be the validation fraction 24 | 25 | # Generate the train/test/valid splits 26 | train_test_valid_datasets = combined_dataset.train_test_split( 27 | test_size=test_fraction, shuffle=True 28 | ) 29 | train_valid_datasets = train_test_valid_datasets["train"].train_test_split( 30 | test_size=(1 - train_fraction) / (1 - test_fraction), shuffle=True 31 | ) 32 | 33 | final_datasets = { 34 | "train": train_valid_datasets["train"], 35 | "test": train_test_valid_datasets["test"], 36 | "valid": train_valid_datasets["test"], 37 | } 38 | 39 | # Add the 'split' column to each dataset 40 | for split in final_datasets: 41 | final_datasets[split] = final_datasets[split].add_column( 42 | "split", [split] * len(final_datasets[split]) 43 | ) 44 | 45 | # Concatenate all splits again 46 | all_datasets = concatenate_datasets( 47 | [final_datasets[split] for split in final_datasets] 48 | ) 49 | df = all_datasets.to_pandas() 50 | 51 | df.rename(columns={"Answer": "answ", "Question": "quest"}, inplace=True) 52 | 53 | # Save the combined dataset as a CSV file 54 | df.to_csv("data_clean.csv", index=False) 55 | -------------------------------------------------------------------------------- /data/tabular/fda_adverse_reactions/meta.yaml: -------------------------------------------------------------------------------- 1 | name: fda_adverse_reactions 2 | description: A dataset of adverse reaction statistics for drugs and reaction events. 3 | targets: 4 | - id: count 5 | description: A count of how many reaction events occurred for this chembl id. 6 | units: 7 | type: ordinal 8 | names: 9 | - noun: adverse reaction frequency 10 | pubchem_aids: [] 11 | uris: [] 12 | - id: event 13 | description: The type of event that occurred for this molecule interaction. 14 | units: 15 | type: string 16 | names: 17 | - noun: adverse event reaction 18 | pubchem_aids: [] 19 | uris: [] 20 | identifiers: 21 | - id: SMILES 22 | type: SMILES 23 | description: This is the SMILES identifier for a given molecule. 
24 | license: CC BY-SA 3.0 25 | links: 26 | - name: Dataset 27 | url: https://platform.opentargets.org/downloads 28 | description: The website which we download the dataset from during the transformation script. 29 | benchmarks: [] 30 | num_points: 94910 31 | bibtex: [] 32 | -------------------------------------------------------------------------------- /data/tabular/flashpoint/meta.yaml: -------------------------------------------------------------------------------- 1 | name: flashpoint 2 | description: | 3 | Curation of experimentally determined flash point values measured with open cup and closed cup methods. 4 | The values are from academic papers, the Hazardous Chemicals Handbook, and the PubChem chemical database. 5 | Differences from the stated sources in the paper are: 6 | * Values from the DIPPR database are not included in their dataset as they are proprietary. 7 | * There are appear to be no values from Lange's handbook of chemistry in their dataset. 8 | * We did our own processing to resolve duplicate SMILES. 9 | targets: 10 | - id: flashpoint 11 | description: Experimental flash point value (K) 12 | units: K 13 | type: continuous 14 | names: 15 | - noun: flash point 16 | uris: 17 | - http://semanticscience.org/resource/CHEMINF_000417 18 | identifiers: 19 | - id: SMILES 20 | type: SMILES 21 | description: SMILES 22 | license: CC BY 4.0 23 | num_points: 9878 # downloaded dataset has 14696 datapoints, but there are duplicate smiles 24 | links: 25 | - url: https://figshare.com/ndownloader/files/18509711 26 | description: Original figshare dataset 27 | bibtex: 28 | - | 29 | "@article{sun2020assessing, 30 | title={Assessing Graph-based Deep Learning Models for Predicting Flash Point}, 31 | author={Sun, Xiaoyu and Krakauer, Nathaniel J and Politowicz, Alexander and Chen, Wei-Ting and Li, Qiying and Li, Zuoyi and Shao, Xianjia and Sunaryo, Alfred and Shen, Mingren and Wang, James and others}, 32 | journal={Molecular informatics}, 33 | volume={39}, 34 | number={6}, 35 | pages={e1900101}, 36 | year={2020} 37 | }" 38 | -------------------------------------------------------------------------------- /data/tabular/formation_energies/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_json( 6 | "https://raw.githubusercontent.com/CJBartel/TestStabilityML/master/mlstabilitytest/mp_data/data/hullout.json" 7 | ) 8 | 9 | df = df.T.reset_index().rename(columns={"index": "composition"}) 10 | df["rxn"] = df["rxn"].str.replace("_", " ") 11 | df.dropna(subset=["rxn", "Ef", "Ed"], inplace=True) 12 | df["Ef"] = df["Ef"].astype(float).round(3) 13 | df["Ed"] = df["Ed"].astype(float).round(3) 14 | print(len(df)) 15 | df.to_csv("data_clean.csv", index=False) 16 | 17 | 18 | if __name__ == "__main__": 19 | process() 20 | -------------------------------------------------------------------------------- /data/tabular/h2_storage_materials/meta.yaml: -------------------------------------------------------------------------------- 1 | name: h2_storage_reversible_hydrides 2 | description: synthetic procedures, experimental and theoretical h2 capacities of hydrides 3 | targets: 4 | - id: h_weight_density_theory 5 | description: theoretical hydrogen storage capacity 6 | units: wt% 7 | type: continuous 8 | names: 9 | - noun: theoretical hydrogen storage weight density 10 | - id: h_weight_density_experiment 11 | description: experimental hydrogen storage capacity 12 | units: wt% 13 | type: continuous 14 | 
names: 15 | - noun: experimental hydrogen storage capacity 16 | identifiers: 17 | - id: material_name 18 | type: IUPAC 19 | description: chemical name 20 | - id: chemical_formula 21 | type: COMPOSITION 22 | names: 23 | - noun: chemical formula 24 | description: chemical formula 25 | - id: synthetic_information 26 | names: 27 | - noun: synthesis procedure summary 28 | description: brief description of synthetic procedure 29 | type: Other 30 | license: File 31 | links: 32 | - url: https://datahub.hymarc.org/dataset/hydrogen-storage-materials-db/resource/4ef1c494-366e-43a3-bed4-a3985de5c374 33 | description: website with source data 34 | - url: https://datahub.hymarc.org/dataset/ad580d95-e7e2-4ef4-a7f6-3b2f91a96eba/resource/4ef1c494-366e-43a3-bed4-a3985de5c374/download/hydstormatdb-reversible_hydrides.csv 35 | description: original_dataset 36 | num_points: 30 37 | bibtex: 38 | - "@online{hymarcReversibleHydrides,\ntitle={Hydrogen Storage Materials Database Reversible Hydrides},\nauthor={HyMARC},\nyear={2019}" 39 | -------------------------------------------------------------------------------- /data/tabular/half_life_obach/meta.yaml: -------------------------------------------------------------------------------- 1 | name: half_life_obach 2 | description: |- 3 | Half life of a drug is the duration for the concentration of the drug 4 | in the body to be reduced by half. It measures the duration of actions of a drug. 5 | This dataset deposited version under CHEMBL assay 1614674. 6 | targets: 7 | - id: half_life_duration 8 | description: the time it takes for the plasma concentration of a drug in the body to be reduced by half 9 | units: hours 10 | type: continuous 11 | significant_digits: 2 12 | names: 13 | - noun: half life in humans after IV administration 14 | - noun: half life time in humans after IV administration 15 | - noun: drug half life time in humans after IV administration 16 | uris: 17 | - http://purl.bioontology.org/ontology/MESH/D006207 18 | benchmarks: 19 | - name: TDC 20 | link: https://tdcommons.ai/ 21 | split_column: split 22 | identifiers: 23 | - id: SMILES 24 | type: SMILES 25 | description: SMILES 26 | - id: chembl_id 27 | type: Other 28 | names: 29 | - noun: ChEMBL database id 30 | - noun: ChEMBL identifier number 31 | description: ChEMBL ids 32 | sample: false 33 | license: CC BY 4.0 34 | links: 35 | - url: https://doi.org/10.1124/dmd.108.020479 36 | description: corresponding publication 37 | - url: https://tdcommons.ai/single_pred_tasks/adme/#half-life-obach-et-al 38 | description: data source 39 | num_points: 667 40 | bibtex: 41 | - |- 42 | @article{Obach2008, 43 | doi = {10.1124/dmd.108.020479}, 44 | url = {https://doi.org/10.1124/dmd.108.020479}, 45 | year = {2008}, 46 | month = apr, 47 | publisher = {American Society for Pharmacology and Experimental Therapeutics (ASPET)}, 48 | volume = {36}, 49 | number = {7}, 50 | pages = {1385--1405}, 51 | author = {R. Scott Obach and Franco Lombardo and Nigel J. Waters}, 52 | title = {Trend Analysis of a Database of Intravenous Pharmacokinetic 53 | Parameters in Humans for 670 Drug Compounds}, 54 | journal = {Drug Metabolism and Disposition} 55 | -------------------------------------------------------------------------------- /data/tabular/herg_central_at_10uM/meta.yaml: -------------------------------------------------------------------------------- 1 | name: herg_central_at_10uM 2 | description: "Human ether-à-go-go related gene (hERG) is crucial for the coordination\nof the heart's beating. 
Thus, if a drug blocks the hERG, it could lead to severe\nadverse effects. Therefore, reliable prediction of hERG liability in the early\nstages of drug design is quite important to reduce the risk of cardiotoxicity-related\nattritions in the later development stages. There are three targets: hERG_at_1microM,\nhERG_at_10microM, and herg_inhib." 3 | targets: 4 | - id: herg_central_at_10uM 5 | description: the percent inhibition of hERG at a 10uM concentration 6 | units: "%" 7 | type: continuous 8 | names: 9 | - noun: hERG inhibition at a concentration of 10uM 10 | - noun: hERG inhibition at a concentration of 10uM 11 | - noun: hERG inhibition at 10uM 12 | - noun: human ether-à-go-go related gene (hERG) inhibition at a concentration of 10uM 13 | - noun: human ether-à-go-go related gene (hERG) inhibition at 10uM 14 | - noun: human ether-à-go-go related gene (hERG) inhibition at 10uM 15 | uris: 16 | - http://purl.obolibrary.org/obo/MI_2136 17 | identifiers: 18 | - id: SMILES 19 | type: SMILES 20 | description: SMILES 21 | license: CC BY 4.0 22 | links: 23 | - url: https://doi.org/10.1089/adt.2011.0425 24 | description: corresponding publication 25 | - url: https://bbirnbaum.com/ 26 | description: TDC Contributer 27 | - url: https://tdcommons.ai/single_pred_tasks/tox/#herg-central 28 | description: Data source 29 | num_points: 306893 30 | bibtex: 31 | - "@article{Du2011,\ndoi = {10.1089/adt.2011.0425},\nurl = {https://doi.org/10.1089/adt.2011.0425},\nyear = {2011},\nmonth = dec,\npublisher = {Mary Ann Liebert Inc},\nvolume = {9},\nnumber = {6},\npages = {580--588},\nauthor = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li},\ntitle = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development},\njournal = {ASSAY and Drug Development Technologies}" 32 | -------------------------------------------------------------------------------- /data/tabular/herg_central_at_1uM/meta.yaml: -------------------------------------------------------------------------------- 1 | name: herg_central_at_1uM 2 | description: "Human ether-à-go-go related gene (hERG) is crucial for the coordination\nof the heart's beating. Thus, if a drug blocks the hERG, it could lead to severe\nadverse effects. Therefore, reliable prediction of hERG liability in the early\nstages of drug design is quite important to reduce the risk of cardiotoxicity-related\nattritions in the later development stages. There are three targets: hERG_at_1microM,\nhERG_at_10microM, and herg_inhib." 
3 | targets: 4 | - id: herg_central_at_1uM 5 | description: the percent inhibition of hERG at a 1uM concentration 6 | units: "%" 7 | type: continuous 8 | names: 9 | - noun: hERG inhibition at a concentration of 1uM 10 | - noun: hERG inhibition at a concentration of 1uM 11 | - noun: hERG inhibition at 1uM 12 | - noun: human ether-à-go-go related gene (hERG) inhibition at a concentration of 1uM 13 | - noun: human ether-à-go-go related gene (hERG) inhibition at 1uM 14 | - noun: human ether-à-go-go related gene (hERG) inhibition at 1uM 15 | uris: 16 | - http://purl.obolibrary.org/obo/MI_2136 17 | identifiers: 18 | - id: SMILES 19 | type: SMILES 20 | description: SMILES 21 | license: CC BY 4.0 22 | links: 23 | - url: https://doi.org/10.1089/adt.2011.0425 24 | description: corresponding publication 25 | - url: https://bbirnbaum.com/ 26 | description: TDC Contributer 27 | - url: https://tdcommons.ai/single_pred_tasks/tox/#herg-central 28 | description: Data source 29 | num_points: 306893 30 | bibtex: 31 | - "@article{Du2011,\ndoi = {10.1089/adt.2011.0425},\nurl = {https://doi.org/10.1089/adt.2011.0425},\nyear = {2011},\nmonth = dec,\npublisher = {Mary Ann Liebert Inc},\nvolume = {9},\nnumber = {6},\npages = {580--588},\nauthor = {Fang Du and Haibo Yu and Beiyan Zou and Joseph Babcock\nand Shunyou Long and Min Li},\ntitle = {hERGCentral: A Large Database to Store, Retrieve, and Analyze Compound Human\nEther-à-go-go Related Gene Channel Interactions to Facilitate Cardiotoxicity Assessment in Drug Development},\njournal = {ASSAY and Drug Development Technologies}" 32 | -------------------------------------------------------------------------------- /data/tabular/iupac_smiles/transform.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import fire 4 | import pandas as pd 5 | from datasets import load_dataset 6 | 7 | 8 | def process(debug=False): 9 | if not os.path.exists("combined_json.jsonl"): 10 | dataset = load_dataset("kjappelbaum/chemnlp_iupac_smiles") 11 | df = pd.DataFrame(dataset["train"]) 12 | else: 13 | file = "combined_json.jsonl" 14 | df = pd.read_json(file, lines=True) 15 | 16 | df.drop_duplicates(subset=["SMILES"], inplace=True) 17 | print(len(df)) 18 | 19 | if debug: 20 | df = df.sample(1000) 21 | 22 | df.to_csv("data_clean.csv", index=False) 23 | 24 | 25 | if __name__ == "__main__": 26 | fire.Fire(process) 27 | -------------------------------------------------------------------------------- /data/tabular/ld50_catmos/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ld50catmos", 8 | filename="cleaned_ld50.csv", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_csv(file) 12 | print(len(df)) 13 | df[ 14 | [ 15 | "num_ghose_violations", 16 | "num_lead_likeness_violations", 17 | "num_lipinski_violations", 18 | "num_carbon_atoms", 19 | "num_oxygen_atoms", 20 | ] 21 | ] = df[ 22 | [ 23 | "num_ghose_violations", 24 | "num_lead_likeness_violations", 25 | "num_lipinski_violations", 26 | "num_carbon_atoms", 27 | "num_oxygen_atoms", 28 | ] 29 | ].astype( 30 | int 31 | ) 32 | df.to_csv("data_clean.csv", index=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | process() 37 | -------------------------------------------------------------------------------- /data/tabular/ld50_zhu/meta.yaml: 
-------------------------------------------------------------------------------- 1 | name: ld50_zhu 2 | description: |- 3 | Acute toxicity LD50 measures 4 | the most conservative dose that can lead to lethal adverse effects. 5 | The higher the dose, the more lethal the drug. 6 | targets: 7 | - id: acute_toxicity 8 | description: Acute Toxicity LD50. 9 | units: log10(1/(mol/kg)) 10 | type: continuous 11 | names: 12 | - noun: acute oral toxicity rat LD50 13 | - noun: acute oral toxicity (LD50 in rats) 14 | - noun: LD50 in rats (oral exposure) 15 | - noun: rat LD50 (oral exposure) 16 | uris: 17 | - http://www.bioassayontology.org/bao#BAO_0002117 18 | identifiers: 19 | - id: SMILES 20 | type: SMILES 21 | description: SMILES 22 | - id: compound_name 23 | type: Other 24 | description: compound name 25 | names: 26 | - noun: compound 27 | - noun: compound name 28 | - noun: drug 29 | license: CC BY 4.0 30 | links: 31 | - url: https://doi.org/10.1021/tx900189p 32 | description: corresponding publication 33 | benchmarks: 34 | - name: TDC 35 | link: https://tdcommons.ai/ 36 | split_column: split 37 | num_points: 7385 38 | bibtex: 39 | - |- 40 | @article{Zhu2009, 41 | doi = {10.1021/tx900189p}, 42 | url = {https://doi.org/10.1021/tx900189p}, 43 | year = {2009}, 44 | month = oct, 45 | publisher = {American Chemical Society ({ACS})}, 46 | volume = {22}, 47 | number = {12}, 48 | pages = {1913--1921}, 49 | author = {Hao Zhu and Todd M. Martin and Lin Ye and Alexander 50 | Sedykh and Douglas M. Young and Alexander Tropsha}, 51 | title = {Quantitative Structure-Activity Relationship Modeling 52 | of Rat Acute Toxicity by Oral Exposure}, 53 | journal = {Chemical Research in Toxicology}} 54 | -------------------------------------------------------------------------------- /data/tabular/mattermodeling_stackexchange/meta.yaml: -------------------------------------------------------------------------------- 1 | name: mattermodeling_stackexchange 2 | description: |- 3 | Questions and answers mined from mattermodeling.stackexchange.com.
4 | targets: 5 | - id: a 6 | description: answer to the question 7 | type: string 8 | - id: title 9 | description: title of the question 10 | type: string 11 | identifiers: 12 | - id: q 13 | type: string 14 | description: question asked on mattermodeling.stackexchange.com 15 | license: CC BY-SA 16 | links: 17 | - url: mattermodeling.stackexchange.com 18 | description: original data source 19 | - url: https://stackoverflow.com/help/licensing 20 | description: information about the license 21 | num_points: 664 22 | templates: 23 | - |- 24 | {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} 25 | {#User: |Question: |Inquiry: |\n!}{#q} 26 | {#Assistant: |Answer: !}{#a} 27 | - |- 28 | {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} 29 | {#Question: |Inquiry: |\n!}{#q} 30 | {#Assistant: |Title: |Answer: !}{#title} 31 | -------------------------------------------------------------------------------- /data/tabular/melting_points/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def preprocess(): 5 | df = pd.read_csv( 6 | "https://www.dropbox.com/scl/fi/op8hf1zcl8cin4zb3qj0s/ochem_clean.csv?rlkey=j41m2z1jk7o9hupec19gaxov9&dl=1" 7 | ) 8 | df = df.rename(columns={"Melting Point": "mp_range"}) 9 | df.dropna(subset=["mp", "NAME", "SMILES", "mp_range"], inplace=True) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | preprocess() 15 | -------------------------------------------------------------------------------- /data/tabular/merge.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from pathlib import Path 4 | from typing import Union 5 | 6 | import fire 7 | import pandas as pd 8 | from tqdm import tqdm 9 | 10 | 11 | def merge_files(dir): 12 | fns = sorted(glob(os.path.join(dir, "data_clean-*.csv"))) 13 | fn_merged = os.path.join(dir, "data_clean.csv") 14 | if os.path.exists(fn_merged): 15 | os.remove(fn_merged) 16 | for fn in fns: 17 | df = pd.read_csv(fn, index_col=False, low_memory=False) 18 | df.to_csv( 19 | fn_merged, mode="a", index=False, header=not os.path.exists(fn_merged) 20 | ) 21 | os.remove(fn) 22 | del df 23 | 24 | 25 | def process_file(file: Union[str, Path]): 26 | dir = Path(file).parent 27 | # check if there is any csv file 28 | if not glob(os.path.join(dir, "*.csv")): 29 | return 30 | if len(glob(os.path.join(dir, "data_clean-0*.csv"))) >= 1: 31 | merge_files(dir) 32 | 33 | 34 | def process_all_files(data_dir): 35 | all_yaml_files = sorted(glob(os.path.join(data_dir, "**", "**", "meta.yaml"))) 36 | all_yaml_files = [f for f in all_yaml_files if "fda" in f] 37 | print(all_yaml_files) 38 | for yaml_file in tqdm(all_yaml_files): 39 | print(f"Processing {yaml_file}") 40 | try: 41 | process_file(yaml_file) 42 | except Exception as e: 43 | print(f"Could not process {yaml_file}: {e}") 44 | 45 | 46 | if __name__ == "__main__": 47 | fire.Fire(process_all_files) 48 | -------------------------------------------------------------------------------- /data/tabular/mofdscribe/meta.yaml: 
-------------------------------------------------------------------------------- 1 | name: mofdscribe 2 | description: |- 3 | Text descriptions of MOF structures. 4 | targets: 5 | - id: description 6 | description: description 7 | type: text 8 | names: 9 | - noun: description 10 | benchmarks: [] 11 | identifiers: 12 | - id: cif 13 | type: text 14 | description: CIFFILE 15 | names: 16 | - noun: CIF file 17 | - noun: Crystallographic Information File (CIF) 18 | - noun: CIF card 19 | license: CC BY 4.0 20 | links: 21 | - url: https://github.com/kjappelbaum/mofdscribe 22 | description: codebase used to generate this dataset 23 | num_points: 1267 24 | bibtex: 25 | - |- 26 | @article{Jablonka_2023, 27 | doi = {10.1021/acscentsci.2c01177}, 28 | url = {https://doi.org/10.1021%2Facscentsci.2c01177}, 29 | year = 2023, 30 | month = {mar}, 31 | publisher = {American Chemical Society ({ACS})}, 32 | volume = {9}, 33 | number = {4}, 34 | pages = {563--581}, 35 | author = {Kevin Maik Jablonka and Andrew S. Rosen and Aditi S. Krishnapriyan and Berend Smit}, 36 | title = {An Ecosystem for Digital Reticular Chemistry}, 37 | journal = {ACS Cent. Sci.} 38 | } 39 | - |- 40 | @article{Ganose_2019, 41 | doi = {10.1557/mrc.2019.94}, 42 | url = {https://doi.org/10.1557%2Fmrc.2019.94}, 43 | year = 2019, 44 | month = {sep}, 45 | publisher = {Springer Science and Business Media {LLC}}, 46 | volume = {9}, 47 | number = {3}, 48 | pages = {874--881}, 49 | author = {Alex M. Ganose and Anubhav Jain}, 50 | title = {Robocrystallographer: automated crystal structure text descriptions and analysis}, 51 | journal = {MRS Communications} 52 | } 53 | templates: 54 | - |- 55 | Task: {#Describe|Write a description of!} the structure with the {cif__names__noun} {cif#}. 56 | {#Answer: |A: |!}{description#} 57 | - |- 58 | Task: {#Create|Generate|Propose!} a {cif__names__noun} of a {#metal-organic framework|MOF|crystal structure|structure|material!} with the following description 59 | {description#}. 60 | {#Answer: |A: |!}{cif#} 61 | -------------------------------------------------------------------------------- /data/tabular/mofdscribe/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | from chemnlp.data.convert import remove_composition_rows 5 | 6 | 7 | def process(): 8 | file = hf_hub_download( 9 | repo_id="kjappelbaum/chemnlp-text-mofdscribe", 10 | filename="data/train-00000-of-00001-ccae794e6d461778.parquet", 11 | repo_type="dataset", 12 | ) 13 | df = pd.read_parquet(file) 14 | print(len(df)) 15 | df["cif"] = df["cif"].apply(remove_composition_rows) 16 | df.to_csv("data_clean.csv", index=False) 17 | 18 | 19 | if __name__ == "__main__": 20 | process() 21 | -------------------------------------------------------------------------------- /data/tabular/mol2svg/meta.yaml: -------------------------------------------------------------------------------- 1 | name: mol2svg 2 | description: |- 3 | This dataset contains SVG images of molecules, including some with substructures 4 | highlighted. 
5 | targets: 6 | - id: completion 7 | type: text 8 | description: completion 9 | identifiers: 10 | - id: prompt 11 | type: text 12 | description: prompt 13 | - id: SMILES 14 | type: SMILES 15 | description: SMILES 16 | license: CC BY 4.0 17 | num_points: 16019 18 | links: 19 | - url: https://github.com/lamalab-org/chem-caption 20 | description: Original codebase used to generate this dataset 21 | templates: 22 | - |- 23 | {prompt#} 24 | {completion#} 25 | -------------------------------------------------------------------------------- /data/tabular/mol2svg/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def preprocess(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-mol-svg") 6 | df = dataset["train"].to_pandas() 7 | df.dropna(inplace=True) 8 | print(len(df)) 9 | df.to_csv("data_clean.csv", index=False) 10 | 11 | 12 | if __name__ == "__main__": 13 | preprocess() 14 | -------------------------------------------------------------------------------- /data/tabular/moses/meta.yaml: -------------------------------------------------------------------------------- 1 | name: moses 2 | description: |- 3 | Molecular Sets (MOSES) is a benchmark platform 4 | for distribution learning based molecule generation. 5 | Within this benchmark, MOSES provides a cleaned dataset of molecules that are ideal of optimization. 6 | It is processed from the ZINC Clean Leads dataset. 7 | benchmarks: 8 | - name: TDC 9 | link: https://tdcommons.ai/ 10 | split_column: split 11 | identifiers: 12 | - id: SMILES 13 | type: SMILES 14 | description: SMILES 15 | license: CC BY 4.0 16 | links: 17 | - url: https://arxiv.org/abs/1811.12823 18 | description: Article about original dataset 19 | - url: https://pubs.acs.org/doi/abs/10.1021/acs.jcim.5b00559 20 | description: Link to publication of associated dataset - zinc 21 | - url: https://github.com/molecularsets/moses 22 | description: Github repository concerning the dataset 23 | num_points: 1936962 24 | bibtex: 25 | - |- 26 | @article{10.3389/fphar.2020.565644, 27 | title={{M}olecular {S}ets ({MOSES}): {A} {B}enchmarking {P}latform for {M}olecular {G}eneration {M}odels}, 28 | author={Polykovskiy, Daniil and Zhebrak, Alexander and Sanchez-Lengeling, Benjamin and Golovanov, 29 | Sergey and Tatanov, Oktai and Belyaev, Stanislav and Kurbanov, Rauf and Artamonov, 30 | Aleksey and Aladinskiy, Vladimir and Veselov, Mark and Kadurin, Artur and Johansson, 31 | Simon and Chen, Hongming and Nikolenko, Sergey and Aspuru-Guzik, Alan and Zhavoronkov, Alex}, 32 | journal={Frontiers in Pharmacology}, 33 | year={2020} 34 | } 35 | -------------------------------------------------------------------------------- /data/tabular/mp_anisotropy/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def transform(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-mp-elastic-anisotropy")["train"] 6 | df = dataset.to_pandas() 7 | print(len(df)) 8 | df[["formula", "elastic_anisotropy", "split"]].to_csv("data_clean.csv", index=False) 9 | 10 | 11 | if __name__ == "__main__": 12 | transform() 13 | -------------------------------------------------------------------------------- /data/tabular/mp_bulk_modulus/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def transform(): 5 | dataset = 
load_dataset("kjappelbaum/chemnlp-mp-bulk-modulus")["train"] 6 | df = dataset.to_pandas() 7 | print(len(df)) 8 | df[["formula", "bulk_modulus", "split"]].to_csv("data_clean.csv", index=False) 9 | 10 | 11 | if __name__ == "__main__": 12 | transform() 13 | -------------------------------------------------------------------------------- /data/tabular/mp_descriptions/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | from chemnlp.data.convert import remove_composition_rows 4 | 5 | 6 | def process(): 7 | dataset = load_dataset("kjappelbaum/chemnlp-robocrys") 8 | df = dataset["train"].to_pandas() 9 | df.dropna( 10 | subset=["cifstr", "description", "description_w_bondlengths"], inplace=True 11 | ) 12 | df["cifstr"] = df["cifstr"].apply(remove_composition_rows) 13 | print(len(df)) 14 | df.to_csv("data_clean.csv", index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | process() 19 | -------------------------------------------------------------------------------- /data/tabular/mp_self_supervised/prepare_data.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | from glob import glob 3 | 4 | import pandas as pd 5 | from pymatgen.core import Structure 6 | from tqdm import tqdm 7 | 8 | from chemnlp.data.convert import cif_file_to_string, is_longer_than_allowed 9 | 10 | data = [] 11 | 12 | 13 | def compile_info(ciffile): 14 | s = Structure.from_file(ciffile) 15 | cif = cif_file_to_string(ciffile) 16 | sg, sg_n = s.get_space_group_info() 17 | 18 | d = { 19 | "formula": s.composition.reduced_formula, 20 | "density": s.density, 21 | "spacegroup": sg, 22 | "spacegroup_number": sg_n, 23 | "cif": cif, 24 | "is_longer_than_allowed": is_longer_than_allowed(cif), 25 | } 26 | return d 27 | 28 | 29 | if __name__ == "__main__": 30 | all_structures = glob("structures/*.cif") # assumes structures have been downloaded 31 | 32 | data = [] 33 | with concurrent.futures.ProcessPoolExecutor() as executor: 34 | for d in tqdm( 35 | executor.map(compile_info, all_structures), total=len(all_structures) 36 | ): 37 | data.append(d) 38 | 39 | df = pd.DataFrame(data) 40 | df.to_json("mpid.json") 41 | -------------------------------------------------------------------------------- /data/tabular/mp_self_supervised/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def remove_composition_from_cif(cif): 5 | # in the second line of cif, split at _ and then take the first element and join it with "cif" 6 | parts = cif.split("\n") 7 | parts[1] = "data_cif" 8 | return "\n".join(parts) 9 | 10 | 11 | def process(): 12 | df = pd.read_json( 13 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-mp-cifs/resolve/main/mpid.json" 14 | ) 15 | df = df.query("is_longer_than_allowed==False").dropna() 16 | df["cif"] = df["cif"].apply(remove_composition_from_cif) 17 | print(len(df)) 18 | df.to_csv("data_clean.csv", index=False) 19 | 20 | 21 | if __name__ == "__main__": 22 | process() 23 | -------------------------------------------------------------------------------- /data/tabular/mp_shear_modulus/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def transform(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-mp-shear-modulus")["train"] 6 | df = dataset.to_pandas() 7 | print(len(df)) 8 | df[["formula", 
"shear_modulus", "split"]].to_csv("data_clean.csv", index=False) 9 | 10 | 11 | if __name__ == "__main__": 12 | transform() 13 | -------------------------------------------------------------------------------- /data/tabular/ncbi_disease/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ncbi_disease 2 | description: |- 3 | ncbi_disease is a named entity recognition dataset for disease mentions. 4 | targets: 5 | - id: matched_words 6 | description: matched words 7 | type: text 8 | names: 9 | - noun: entity 10 | - noun: matched entity 11 | identifiers: 12 | - id: sentence 13 | description: Sentence 14 | type: text 15 | names: 16 | - noun: sentence 17 | - noun: text 18 | license: https://huggingface.co/datasets/bigbio/blurb/blob/main/LICENSE 19 | links: 20 | - url: https://huggingface.co/datasets/bigbio/blurb 21 | description: original dataset 22 | benchmarks: 23 | - name: ncbi_disease 24 | link: hhttps://huggingface.co/datasets/bigbio/blurb 25 | split_column: split 26 | num_points: 7075 27 | bibtex: 28 | - |- 29 | @article{gu2021domain, 30 | title = { 31 | Domain-specific language model pretraining for biomedical natural 32 | language processing 33 | }, 34 | author = { 35 | Gu, Yu and Tinn, Robert and Cheng, Hao and Lucas, Michael and 36 | Usuyama, Naoto and Liu, Xiaodong and Naumann, Tristan and Gao, 37 | Jianfeng and Poon, Hoifung 38 | }, 39 | year = 2021, 40 | journal = {ACM Transactions on Computing for Healthcare (HEALTH)}, 41 | publisher = {ACM New York, NY}, 42 | volume = 3, 43 | number = 1, 44 | pages = {1--23} 45 | } 46 | templates: 47 | - |- 48 | Task: Find all the mentions of diseases in the {#following|subsequent!} {#text|sentence!}. Return the matching {#words|entities!}. If there is no {#match|mention of a disease|matching entity!}, return `no match`. 
49 | {#Sentence|Description!}: {sentence#} 50 | Answer: {matched_words#} 51 | - |- 52 | User: Does the following text contain mentions of diseases?{# Can you return matches?| Can you output matches?!} 53 | {#Text: |!}{sentence#} 54 | Assistant: {#I found|There is!} {matched_words#} 55 | -------------------------------------------------------------------------------- /data/tabular/ncbi_disease/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | 4 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner 5 | from chemnlp.data.utils import oxford_comma_join 6 | 7 | 8 | def process(): 9 | # tokenized at whitespaces and punctuations 10 | dataset = load_dataset("bigbio/blurb", "ncbi_disease") 11 | dfs = [] 12 | for split in ["train", "validation", "test"]: 13 | df_ = dataset[split].to_pandas() 14 | df_["split"] = split 15 | dfs.append(df_) 16 | df = pd.concat(dfs) 17 | ner_labels = df["ner_tags"] 18 | 19 | matched_words = [] 20 | for tokens, ner_label in zip(df["tokens"], ner_labels): 21 | words = group_tokens_by_labels(tokens, ner_label) 22 | if len(words) == 0: 23 | matched_words.append("no match") 24 | else: 25 | matched_words.append(oxford_comma_join(words)) 26 | 27 | df["matched_words"] = matched_words 28 | df["sentence"] = df["tokens"].apply(punctuation_joiner) 29 | 30 | df = df[["sentence", "matched_words"]] 31 | 32 | # ensure we have at least 5 words in a sentence 33 | df = df[df["sentence"].apply(lambda x: len(x.split()) >= 5)] 34 | 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/nomad_structure/transform.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import pandas as pd 3 | 4 | from chemnlp.data.convert import mask_cif_lines, remove_composition_rows 5 | 6 | DATASET_NAME = "nomad-structure" 7 | 8 | 9 | def prepare_data(): 10 | dataset_name = "n0w0f/nomad-structure-csv" 11 | split_name = "train" # data without any split @ hf 12 | filename_to_save = "data_clean.csv" 13 | 14 | # Load the dataset from Hugging Face 15 | dataset = datasets.load_dataset(dataset_name, split=split_name) 16 | 17 | df = pd.DataFrame(dataset) 18 | df = df[~df["is_longer_than_allowed"]] 19 | # assert column names 20 | fields_orig = df.columns.tolist() 21 | assert fields_orig == [ 22 | "cif", 23 | "formula", 24 | "spacegroup", 25 | "spacegroup_number", 26 | "crystal_system", 27 | "pointgroup", 28 | "density", 29 | "is_longer_than_allowed", 30 | ] 31 | df["cif"] = df["cif"].apply(remove_composition_rows) 32 | df["cif_masked"] = df["cif"].apply(mask_cif_lines) 33 | # remove duplicates if any 34 | df = df.drop_duplicates() 35 | df.dropna(inplace=True) 36 | df.to_csv(filename_to_save, index=False) 37 | datapoints = len(df) 38 | return datapoints 39 | 40 | 41 | if __name__ == "__main__": 42 | print(f" Preparing clean tabular {DATASET_NAME} datatset") 43 | datapoints = prepare_data() 44 | print( 45 | f" Finished Preparing clean tabular {DATASET_NAME} datatset with {datapoints} datapoints" 46 | ) 47 | -------------------------------------------------------------------------------- /data/tabular/ocp/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ocp 2 | description: |- 3 | CatBerta training data. 
4 | targets: 5 | - id: target 6 | description: target 7 | type: continuous 8 | units: eV 9 | significant_digits: 4 10 | names: 11 | - noun: adsorption energy 12 | identifiers: 13 | - id: text 14 | type: text 15 | description: description 16 | license: MIT (based on ocp) 17 | links: 18 | - url: https://drive.google.com/drive/folders/1puiJ9FbLEA3QIHmZromecEndlemag9hg?usp=sharing 19 | description: original data source 20 | num_points: 125000 21 | bibtex: 22 | - |- 23 | @article{ock2023catalyst, 24 | title={Catalyst Property Prediction with CatBERTa: Unveiling Feature Exploration Strategies through Large Language Models}, 25 | author={Ock, Janghoon and Guntuboina, Chakradhar and Farimani, Amir Barati}, 26 | journal={arXiv preprint arXiv:2309.00563}, 27 | year={2023} 28 | } 29 | - |- 30 | @article{ocp_dataset, 31 | author = {Chanussot*, Lowik and Das*, Abhishek and Goyal*, Siddharth and Lavril*, Thibaut and Shuaibi*, Muhammed and Riviere, Morgane and Tran, Kevin and Heras-Domingo, Javier and Ho, Caleb and Hu, Weihua and Palizhati, Aini and Sriram, Anuroop and Wood, Brandon and Yoon, Junwoong and Parikh, Devi and Zitnick, C. Lawrence and Ulissi, Zachary}, 32 | title = {Open Catalyst 2020 (OC20) Dataset and Community Challenges}, 33 | journal = {ACS Catalysis}, 34 | year = {2021}, 35 | doi = {10.1021/acscatal.0c04525}, 36 | } 37 | templates: 38 | - |- 39 | Question: What is the adsorption energy of the following adsorbate-adsorbent pair? 40 | Text: {text#} 41 | Answer: {target#} {target__units} 42 | - |- 43 | Task: {#Predict|Estimate|Calculate|Compute|Determine!} the adsorption energy of the following adsorbate-adsorbent pair. 44 | Text: {text#} 45 | Answer: {target#} {target__units} 46 | -------------------------------------------------------------------------------- /data/tabular/ocp/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datasets import load_dataset 3 | from pylatexenc.latexencode import unicode_to_latex 4 | 5 | 6 | def uniCode2Latex(text: str) -> str: 7 | """ 8 | converts unicode text to latex and 9 | fixes UTF-8 chars for latex in a certain range: 10 | ₀:$_0$ ... 
₉:$_9$ 11 | 12 | see https://github.com/phfaist/pylatexenc/issues/72 13 | 14 | Args: 15 | text(str): the string to fix 16 | 17 | Return: 18 | str: latex presentation of UTF-8 char 19 | """ 20 | for code in range(8320, 8330): 21 | text = text.replace(chr(code), f"$_{code-8320}$") 22 | 23 | text = text.replace("\u0305", "$^-$") 24 | text = text.replace("\u207A", "$^+$") 25 | text = text.replace("\u207B", "$^-$") 26 | text = text.replace("\u2074", "$^4$") 27 | text = text.replace("\u2070", "$^0$") 28 | text = text.replace("\u2078", "$^1$") 29 | text = text.replace("\u2075", "$^2$") 30 | text = text.replace("\u2076", "$^3$") 31 | text = text.replace("\u2077", "$^5$") 32 | 33 | return unicode_to_latex(text) 34 | 35 | 36 | def process(): 37 | dataset = load_dataset("kjappelbaum/chemnlp-ocp") 38 | df_train = dataset["train"].to_pandas() 39 | df_val = dataset["valid"].to_pandas() 40 | 41 | df_train["split"] = "train" 42 | df_val["split"] = "valid" 43 | 44 | df = pd.concat([df_train, df_val]) 45 | df["text"] = df["text"].apply(uniCode2Latex) 46 | print(len(df)) 47 | df.to_csv("data_clean.csv", index=False) 48 | 49 | 50 | if __name__ == "__main__": 51 | process() 52 | -------------------------------------------------------------------------------- /data/tabular/opv/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def process(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-opv")["train"] 6 | df = dataset.to_pandas() 7 | df["LUMO"] = -1 * df["-LUMO (eV)"] 8 | df["HOMO"] = -1 * df["-HOMO (eV)"] 9 | df.rename( 10 | columns={ 11 | "PCE_ave(%)": "PCE_ave", 12 | "Voc (V)": "Voc", 13 | "Jsc (mA cm^2)": "Jsc", 14 | "Mw (kg mol^-1)": "Mw", 15 | "Mn (kg mol^-1)": "Mn", 16 | "PDI (=Mw/Mn)": "PDI", 17 | "bandgap(eV)": "bandgap", 18 | }, 19 | inplace=True, 20 | ) 21 | 22 | df = df.dropna( 23 | subset=[ 24 | "HOMO", 25 | "LUMO", 26 | "Mw", 27 | "PDI", 28 | "FF", 29 | "Jsc", 30 | "Voc", 31 | "PCE_ave", 32 | "bandgap", 33 | ] 34 | ) 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/oqmd/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def process(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-oqmd")["train"] 6 | df = dataset.to_pandas() 7 | 8 | df.dropna( 9 | subset=[ 10 | "name", 11 | "formula", 12 | "spacegroup", 13 | "nelements", 14 | "nsites", 15 | "energy_per_atom", 16 | "formation_energy_per_atom", 17 | "band_gap", 18 | "volume_per_atom", 19 | "magnetization_per_atom", 20 | "atomic_volume_per_atom", 21 | ], 22 | inplace=True, 23 | ) 24 | print(len(df)) 25 | df.to_csv("data_clean.csv", index=False) 26 | 27 | 28 | if __name__ == "__main__": 29 | process() 30 | -------------------------------------------------------------------------------- /data/tabular/orbnet_denali/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | 4 | def process(): 5 | dataset = load_dataset("kjappelbaum/chemnlp-orbnet-denali") 6 | df = dataset["train"].to_pandas() 7 | df = df.dropna() 8 | print(len(df)) 9 | df.rename(columns={"smiles": "SMILES"}, inplace=True) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | process() 15 | 
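Note: the transform scripts in this directory follow a common shape — load a raw table (typically from the Hugging Face Hub or a hosted CSV/JSON), drop incomplete rows, normalise column names such as `smiles` → `SMILES`, print the row count, and write `data_clean.csv` next to the dataset's `meta.yaml`. The sketch below is only an illustration of that shared pattern, not a file from this repository; the Hub repo id and the column names are placeholders.

```python
# Illustrative only: a minimal transform.py following the pattern used throughout
# data/tabular/. The Hub repo id and column names are placeholders, not a real dataset.
from datasets import load_dataset


def process():
    # 1. Load the raw data (here: a hypothetical Hugging Face dataset).
    df = load_dataset("some-org/example-dataset")["train"].to_pandas()

    # 2. Drop rows missing values in the columns the prompt templates rely on.
    df = df.dropna(subset=["smiles", "target"])

    # 3. Normalise column names so downstream sampling code finds "SMILES".
    df = df.rename(columns={"smiles": "SMILES"})

    # 4. Report the number of datapoints (should match num_points in meta.yaml)
    #    and write the cleaned table next to meta.yaml.
    print(len(df))
    df.to_csv("data_clean.csv", index=False)


if __name__ == "__main__":
    process()
```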
-------------------------------------------------------------------------------- /data/tabular/ord_masked/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_rxn_smiles_yield_pred 2 | description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | identifiers: 5 | - id: masked_rxn_smiles 6 | type: text 7 | description: reaction SMILES with one element masked 8 | names: 9 | - noun: reaction SMILES with one element masked as `MASK` 10 | - noun: reaction SMILES with one element hidden as `MASK` 11 | - noun: masked reaction SMILES (one component masked as `MASK`) 12 | - noun: masked reaction SMILES string (one component masked as `MASK`) 13 | - noun: masked RXNSMILES (one component masked as `MASK`) 14 | targets: 15 | - id: missing_component 16 | type: text 17 | description: masked element 18 | license: CC BY SA 4.0 19 | links: 20 | - url: https://github.com/open-reaction-database/ord-data 21 | description: original data source 22 | num_points: 2263983 23 | bibtex: 24 | - |- 25 | @article{Kearnes_2021, 26 | doi = {10.1021/jacs.1c09820}, 27 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 28 | year = 2021, 29 | month = {nov}, 30 | publisher = {American Chemical Society ({ACS})}, 31 | volume = {143}, 32 | number = {45}, 33 | pages = {18820--18826}, 34 | author = {Steven M. Kearnes and Michael R. Maser 35 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 36 | and Spencer D. Dreher and Joel M. Hawkins 37 | and Klavs F. Jensen and Connor W. Coley}, 38 | title = {The Open Reaction Database}, 39 | journal = {J. Am. Chem. Soc.} 40 | } 41 | templates: 42 | - The masked component in the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#} is {missing_component#}. 43 | - The {#chemical|compound!} with SMILES {missing_component#} is the masked component in the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#}. 44 | - |- 45 | Question: What is the masked component in the {masked_rxn_smiles__names__noun} {masked_rxn_smiles#}? 46 | Answer: {missing_component#}. 47 | - |- 48 | Task: Predict the masked component in a {masked_rxn_smiles__names__noun}. 
49 | Description: {masked_rxn_smiles#} 50 | {#Answer|Solution!}: {missing_component#} 51 | -------------------------------------------------------------------------------- /data/tabular/ord_masked/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ord", 8 | filename="ord_rxn.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df.dropna(subset=["masked_rxn_smiles", "missing_component"], inplace=True) 13 | print(len(df)) 14 | df.to_csv("data_clean.csv", index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | process() 19 | -------------------------------------------------------------------------------- /data/tabular/ord_predictions/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def oxford_comma_join(elements): 6 | try: 7 | if len(elements) == 1: 8 | return elements[0] 9 | elif len(elements) == 2: 10 | return " and ".join(elements) 11 | else: 12 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 13 | except Exception: 14 | return None 15 | 16 | 17 | def process(): 18 | file = hf_hub_download( 19 | repo_id="kjappelbaum/chemnlp-ord", 20 | filename="ord_rxn.json", 21 | repo_type="dataset", 22 | ) 23 | df = pd.read_json(file) 24 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 25 | df["product_string"] = df["products"].apply(oxford_comma_join) 26 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 27 | df.dropna(subset=["educt_string", "product_string"], inplace=True) 28 | print(len(df)) 29 | df[["RXNSMILES", "educt_string", "product_string"]].to_csv( 30 | "data_clean.csv", index=False 31 | ) 32 | 33 | 34 | if __name__ == "__main__": 35 | process() 36 | -------------------------------------------------------------------------------- /data/tabular/ord_procedure_steps/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_procedure_steps 2 | description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | identifiers: 5 | - id: steps_string 6 | type: text 7 | description: reaction action sequence 8 | names: 9 | - noun: reaction action sequence 10 | - noun: reaction action steps 11 | targets: 12 | - id: procedure 13 | type: text 14 | description: reaction procedure 15 | names: 16 | - noun: reaction procedure 17 | - noun: description of reaction procedure 18 | - noun: reaction procedure description 19 | - noun: procedure 20 | license: CC BY SA 4.0 21 | links: 22 | - url: https://github.com/open-reaction-database/ord-data 23 | description: original data source 24 | num_points: 76815 25 | bibtex: 26 | - |- 27 | @article{Kearnes_2021, 28 | doi = {10.1021/jacs.1c09820}, 29 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 30 | year = 2021, 31 | month = {nov}, 32 | publisher = {American Chemical Society ({ACS})}, 33 | volume = {143}, 34 | number = {45}, 35 | pages = {18820--18826}, 36 | author = {Steven M. Kearnes and Michael R. Maser 37 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 38 | and Spencer D. Dreher and Joel M. Hawkins 39 | and Klavs F. Jensen and Connor W. Coley}, 40 | title = {The Open Reaction Database}, 41 | journal = {J. Am. Chem. 
Soc.} 42 | } 43 | templates: 44 | - |- 45 | User: {#Can you|Could you!} {#tell me|give me|show me!} the {procedure__names__noun} for the {steps_string__names__noun} {steps_string#}? 46 | Assistant: {#I propose|I suggest!} the {procedure__names__noun} {procedure#} 47 | - |- 48 | User: {#Can you|Could you!} {#tell me|give me|show me!} the {steps_string__names__noun} for the {procedure__names__noun} {procedure#}? 49 | Assistant: {#I propose|I suggest!} the {steps_string__names__noun} {steps_string#} 50 | - |- 51 | Task: Convert a {procedure__names__noun} into a {steps_string__names__noun}. 52 | Procedure: {procedure#} 53 | Answer: {steps_string#} 54 | -------------------------------------------------------------------------------- /data/tabular/ord_procedure_steps/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ord", 8 | filename="ord_data_compiled.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df = df.dropna(subset=["steps_string", "procedure"]) 13 | df.query("steps_string != 'None'", inplace=True) 14 | df.query("procedure != 'None'", inplace=True) 15 | df = df[["steps_string", "procedure"]] 16 | print(len(df)) 17 | df.to_csv("data_clean.csv", index=False) 18 | 19 | 20 | if __name__ == "__main__": 21 | process() 22 | -------------------------------------------------------------------------------- /data/tabular/ord_rxn_smiles_procedure/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | from rxn.chemutils.reaction_equation import rxn_standardization 4 | from rxn.chemutils.reaction_smiles import parse_any_reaction_smiles 5 | 6 | 7 | def canoncialize_rxn_smiles(rxn_smiles): 8 | try: 9 | return rxn_standardization(parse_any_reaction_smiles(rxn_smiles)).to_string() 10 | except Exception: 11 | return None 12 | 13 | 14 | def process(): 15 | file = hf_hub_download( 16 | repo_id="kjappelbaum/chemnlp-ord", 17 | filename="ord_data_compiled.json", 18 | repo_type="dataset", 19 | ) 20 | df = pd.read_json(file) 21 | df["canonical_rxn_smiles"] = df["rxn_smiles"].apply(canoncialize_rxn_smiles) 22 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 23 | df = df.dropna(subset=["RXNSMILES", "procedure"]) 24 | df = df.query("RXNSMILES != 'None'") 25 | # make sure RXNSMILES values have at least 10 characters 26 | df = df[df["RXNSMILES"].str.len() > 10] 27 | # there must be > in the reaction SMILES 28 | df = df[df["RXNSMILES"].str.contains(">")] 29 | df = df.query("procedure != 'None'") 30 | df.query( 31 | "steps_string != 'None'", inplace=True 32 | ) # this removes cases in which is just says "follow the procedure above" 33 | df = df.query("procedure != ''") 34 | df = df[["RXNSMILES", "procedure"]] 35 | print(len(df)) 36 | df.to_csv("data_clean.csv", index=False) 37 | 38 | 39 | if __name__ == "__main__": 40 | process() 41 | -------------------------------------------------------------------------------- /data/tabular/ord_rxn_smiles_yield_pred/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_rxn_smiles_yield_pred 2 | description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | targets: 5 | - id: yield 6 | type: continuous 7 | significant_digits: 
0 8 | description: reaction yield 9 | units: \% 10 | names: 11 | - noun: yield 12 | - noun: reaction yield 13 | identifiers: 14 | - id: RXNSMILES 15 | type: RXNSMILES 16 | description: reaction SMILES 17 | names: 18 | - noun: reaction SMILES 19 | - noun: reaction SMILES string 20 | - noun: RXNSMILES 21 | - noun: reaction SMILES (RXNSMILES) 22 | license: CC BY SA 4.0 23 | links: 24 | - url: https://github.com/open-reaction-database/ord-data 25 | description: original data source 26 | num_points: 28 27 | bibtex: 28 | - |- 29 | @article{Kearnes_2021, 30 | doi = {10.1021/jacs.1c09820}, 31 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 32 | year = 2021, 33 | month = {nov}, 34 | publisher = {American Chemical Society ({ACS})}, 35 | volume = {143}, 36 | number = {45}, 37 | pages = {18820--18826}, 38 | author = {Steven M. Kearnes and Michael R. Maser 39 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 40 | and Spencer D. Dreher and Joel M. Hawkins 41 | and Klavs F. Jensen and Connor W. Coley}, 42 | title = {The Open Reaction Database}, 43 | journal = {J. Am. Chem. Soc.} 44 | } 45 | templates: 46 | - The {yield__names__noun} of a reaction with the {RXNSMILES__names__noun} {RXNSMILES#} is {yield#}{yield__units}. 47 | - |- 48 | User: {#I need|I want|I would like!} to run a reaction with the {RXNSMILES__names__noun} {RXNSMILES#}. What is the {yield__names__noun} {#I can expect|I should expect|I should get|I can get!}? 49 | Assistant: {#The|The expected|The predicted|The estimated!} {yield__names__noun} is {yield#}{yield__units}. 50 | - |- 51 | Question: {#What is|What's|What is the|What's the!} {yield__names__noun} of a reaction with the {RXNSMILES__names__noun} {RXNSMILES#}? 52 | Answer: {yield#}{yield__units}. 53 | -------------------------------------------------------------------------------- /data/tabular/ord_rxn_smiles_yield_pred/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | from rxn.chemutils.reaction_equation import rxn_standardization 4 | from rxn.chemutils.reaction_smiles import parse_any_reaction_smiles 5 | 6 | 7 | def canoncialize_rxn_smiles(rxn_smiles): 8 | try: 9 | return rxn_standardization(parse_any_reaction_smiles(rxn_smiles)).to_string() 10 | except Exception: 11 | return None 12 | 13 | 14 | def process(): 15 | file = hf_hub_download( 16 | repo_id="kjappelbaum/chemnlp-ord", 17 | filename="ord_data_compiled.json", 18 | repo_type="dataset", 19 | ) 20 | df = pd.read_json(file) 21 | df["canonical_rxn_smiles"] = df["rxn_smiles"].apply(canoncialize_rxn_smiles) 22 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 23 | df = df.dropna(subset=["RXNSMILES", "yield"]) 24 | # make sure RXNSMILES values have at least 10 characters 25 | df = df[df["RXNSMILES"].str.len() > 10] 26 | # there must be > in the reaction SMILES 27 | df = df[df["RXNSMILES"].str.contains(">")] 28 | df.query( 29 | "steps_string != 'None'", inplace=True 30 | ) # this removes cases in which is just says "follow the procedure above" 31 | df = df.query("RXNSMILES != 'None'") 32 | df = df[["RXNSMILES", "yield"]] 33 | print(len(df)) 34 | df.to_csv("data_clean.csv", index=False) 35 | 36 | 37 | if __name__ == "__main__": 38 | process() 39 | -------------------------------------------------------------------------------- /data/tabular/ord_steps_yield/meta.yaml: -------------------------------------------------------------------------------- 1 | name: ord_steps_yield 2 | 
description: |- 3 | The open reaction database is a database of chemical reactions and their conditions 4 | identifiers: 5 | - id: non_yield_steps_string 6 | type: text 7 | description: reaction action sequence 8 | names: 9 | - noun: reaction action sequence 10 | - noun: reaction action steps 11 | targets: 12 | - id: yield 13 | type: continuous 14 | significant_digits: 0 15 | description: reaction yield 16 | units: \% 17 | names: 18 | - noun: yield 19 | - noun: reaction yield 20 | license: CC BY SA 4.0 21 | links: 22 | - url: https://github.com/open-reaction-database/ord-data 23 | description: original data source 24 | num_points: 30 25 | bibtex: 26 | - |- 27 | @article{Kearnes_2021, 28 | doi = {10.1021/jacs.1c09820}, 29 | url = {https://doi.org/10.1021%2Fjacs.1c09820}, 30 | year = 2021, 31 | month = {nov}, 32 | publisher = {American Chemical Society ({ACS})}, 33 | volume = {143}, 34 | number = {45}, 35 | pages = {18820--18826}, 36 | author = {Steven M. Kearnes and Michael R. Maser 37 | and Michael Wleklinski and Anton Kast and Abigail G. Doyle 38 | and Spencer D. Dreher and Joel M. Hawkins 39 | and Klavs F. Jensen and Connor W. Coley}, 40 | title = {The Open Reaction Database}, 41 | journal = {J. Am. Chem. Soc.} 42 | } 43 | templates: 44 | - |- 45 | The {yield__names__noun} of a reaction with the {non_yield_steps_string__names__noun} below is {yield#}{yield__units}. 46 | {non_yield_steps_string__names__noun}: {non_yield_steps_string#} 47 | - |- 48 | User: {#I need|I want|I would like!} to run a reaction with the {non_yield_steps_string__names__noun} {non_yield_steps_string#}. What is the {yield__names__noun} {#I can expect|I should expect|I should get|I can get!}? 49 | Assistant: {#The|The expected|The predicted|The estimated!} {yield__names__noun} is {yield#}{yield__units}. 50 | - |- 51 | Task: {#Predict|Estimate!} the {yield__names__noun} of a reaction based on the {non_yield_steps_string__names__noun}. 
52 | Description: {non_yield_steps_string#} 53 | Answer: {yield#}{yield__units} 54 | -------------------------------------------------------------------------------- /data/tabular/ord_steps_yield/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def process(): 6 | file = hf_hub_download( 7 | repo_id="kjappelbaum/chemnlp-ord", 8 | filename="ord_data_compiled.json", 9 | repo_type="dataset", 10 | ) 11 | df = pd.read_json(file) 12 | df = df.dropna(subset=["non_yield_steps_string", "yield"]) 13 | df = df.query("non_yield_steps_string != 'None'") 14 | df = df[["non_yield_steps_string", "yield"]] 15 | print(len(df)) 16 | df.to_csv("data_clean.csv", index=False) 17 | 18 | 19 | if __name__ == "__main__": 20 | process() 21 | -------------------------------------------------------------------------------- /data/tabular/perovskite_db/transform.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | from datasets import load_dataset 4 | 5 | 6 | def oxford_comma_join(list_of_str): 7 | if len(list_of_str) == 1: 8 | return list_of_str[0] 9 | elif len(list_of_str) == 2: 10 | return " and ".join(list_of_str) 11 | else: 12 | return ", ".join(list_of_str[:-1]) + ", and " + list_of_str[-1] 13 | 14 | 15 | def preprocess(): 16 | df = load_dataset("kjappelbaum/pervoskite_db", delimiter="|")["train"].to_pandas() 17 | df.dropna( 18 | subset=[ 19 | "device_stack", 20 | "pce", 21 | "ff", 22 | "jsc", 23 | "voc", 24 | "reduced_formulas", 25 | "descriptive_formulas", 26 | "iupac_formulas", 27 | "bandgap", 28 | ], 29 | inplace=True, 30 | ) 31 | device_stack_strings = [] 32 | 33 | df["pce"] = df["pce"].round(2) 34 | df["ff"] = df["ff"].round(2) 35 | df["jsc"] = df["jsc"].round(2) 36 | df["voc"] = df["voc"].round(2) 37 | df["bandgap"] = df["bandgap"].round(2) 38 | 39 | for _i, row in df.iterrows(): 40 | device_stack = ast.literal_eval(row["device_stack"]) 41 | device_stack_string = oxford_comma_join(device_stack) 42 | absorber = row["descriptive_formulas"] 43 | device_stack_string = device_stack_string.replace("Perovskite", absorber) 44 | device_stack_strings.append(device_stack_string) 45 | 46 | df["device_stack_string"] = device_stack_strings 47 | df[ 48 | [ 49 | "pce", 50 | "ff", 51 | "jsc", 52 | "voc", 53 | "bandgap", 54 | "reduced_formulas", 55 | "descriptive_formulas", 56 | "iupac_formulas", 57 | "device_stack_string", 58 | ] 59 | ].to_csv("data_clean.csv", index=False) 60 | print(len(df)) 61 | 62 | 63 | if __name__ == "__main__": 64 | preprocess() 65 | -------------------------------------------------------------------------------- /data/tabular/physics_stackexchange/meta.yaml: -------------------------------------------------------------------------------- 1 | name: physics_stackexchange 2 | description: |- 3 | Questions and answers mined from physics.stackexchange.com. 
4 | targets: 5 | - id: a 6 | description: answer to the question 7 | type: string 8 | - id: title 9 | description: title of the question 10 | type: string 11 | identifiers: 12 | - id: q 13 | type: string 14 | description: question asked on physics.stackexchange.com 15 | license: CC BY-SA 16 | links: 17 | - url: physics.stackexchange.com 18 | description: original data source 19 | - url: https://stackoverflow.com/help/licensing 20 | description: information about the license 21 | num_points: 6732 22 | templates: 23 | - |- 24 | {#Task: Please answer the question of the user.|Task: Provide a detailed response to the user's question.|Task: Address the user's query with a well-structured answer.|Task: Your role is to respond to the user's question with clarity.|Task: Offer a concise and informative answer to the user's question.|Task: Provide a clear and concise reply to the user's inquiry.!} 25 | {#User: |Question: |Inquiry: |\n!}{#q} 26 | {#Assistant: |Answer: !}{#a} 27 | - |- 28 | {#Task: Generate a title for this question.|Task: Create a meaningful title for this question.|Task: Summarize the question in a title.!} 29 | {#Question: |Inquiry: |\n!}{#q} 30 | {#Assistant: |Title: |Answer: |!}{#title} 31 | -------------------------------------------------------------------------------- /data/tabular/qm8/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_json( 6 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-qm8/resolve/main/qm8.json" 7 | ) 8 | 9 | df = df.replace("RDKit", "ChemNLP", regex=True) 10 | df.dropna(inplace=True) 11 | df = df.rename(columns={"smiles": "SMILES"}) 12 | df = df.query("is_longer_than_allowed==False") 13 | columns = [ 14 | "E1-CC2", 15 | "E2-CC2", 16 | "f1-CC2", 17 | "f2-CC2", 18 | "E1-PBE0", 19 | "E2-PBE0", 20 | "f1-PBE0", 21 | "f2-PBE0", 22 | "E1-CAM", 23 | "E2-CAM", 24 | "f1-CAM", 25 | "f2-CAM", 26 | ] 27 | # filter out rows in which one of the columns is not a float. Filter explicitly for the row in which 28 | # the values for all those columns are floats. 
29 | df = df[df[columns].apply(lambda x: x.apply(lambda y: isinstance(y, float))).all(1)] 30 | df[columns] = df[columns].astype(float) 31 | print(len(df)) 32 | df.to_csv("data_clean.csv", index=False) 33 | 34 | 35 | if __name__ == "__main__": 36 | process() 37 | -------------------------------------------------------------------------------- /data/tabular/qm9/transform.py: -------------------------------------------------------------------------------- 1 | import datasets 2 | import pandas as pd 3 | 4 | DATASET_NAME = "qm9" 5 | 6 | 7 | def prepare_data(): 8 | dataset_name = "n0w0f/qm9-csv" 9 | split_name = "train" # data without any split @ hf 10 | filename_to_save = "data_clean.csv" 11 | 12 | # Load the dataset from Hugging Face 13 | dataset = datasets.load_dataset(dataset_name, split=split_name) 14 | 15 | df = pd.DataFrame(dataset) 16 | 17 | # assert column names 18 | fields_orig = df.columns.tolist() 19 | assert fields_orig == [ 20 | "inchi", 21 | "smiles", 22 | "rotational_constant_a", 23 | "rotational_constant_b", 24 | "rotational_constant_c", 25 | "dipole_moment", 26 | "polarizability", 27 | "homo", 28 | "lumo", 29 | "gap", 30 | "r2", 31 | "zero_point_energy", 32 | "u0", 33 | "u298", 34 | "h298", 35 | "g298", 36 | "heat_capacity", 37 | ] 38 | assert not df.duplicated().sum() 39 | 40 | # remove duplicates if any 41 | df = df.drop_duplicates() 42 | 43 | datapoints = len(df) 44 | # some parts of the code assume that "SMILES" is in upper case, rename this column 45 | df.rename(columns={"smiles": "SMILES"}, inplace=True) 46 | df = df.replace("RDKit", "ChemNLP", regex=True) 47 | df.to_csv(filename_to_save, index=False) 48 | return datapoints 49 | 50 | 51 | if __name__ == "__main__": 52 | print(f" Preparing clean tabular {DATASET_NAME} datatset") 53 | datapoints = prepare_data() 54 | print( 55 | f" Finished Preparing clean tabular {DATASET_NAME} datatset with {datapoints} datapoints" 56 | ) 57 | -------------------------------------------------------------------------------- /data/tabular/qmof_quantum/transform.py: -------------------------------------------------------------------------------- 1 | import ast 2 | 3 | import pandas as pd 4 | 5 | 6 | def process(): 7 | df = pd.read_json( 8 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-qmof-data/resolve/main/qmof_data.json" 9 | ) 10 | 11 | df.dropna( 12 | subset=[ 13 | "outputs.pbe.bandgap", 14 | "outputs.pbe.cbm", 15 | "outputs.pbe.vbm", 16 | "outputs.hle17.bandgap", 17 | "outputs.hle17.cbm", 18 | "outputs.hle17.vbm", 19 | "outputs.hse06.bandgap", 20 | "outputs.hse06.cbm", 21 | "outputs.hse06.vbm", 22 | "info.pld", 23 | "info.lcd", 24 | "info.density", 25 | "info.mofid.mofid", 26 | "info.mofid.smiles_nodes", 27 | "info.mofid.smiles_linkers", 28 | "info.mofid.topology", 29 | "info.symmetry.spacegroup_number", 30 | ], 31 | inplace=True, 32 | ) 33 | 34 | df["info.mofid.smiles_nodes"] = df["info.mofid.smiles_nodes"].apply( 35 | lambda x: ", ".join(ast.literal_eval(x)) 36 | ) 37 | 38 | df["info.mofid.smiles_linkers"] = df["info.mofid.smiles_linkers"].apply( 39 | lambda x: ", ".join(ast.literal_eval(x)) 40 | ) 41 | 42 | print(len(df)) 43 | 44 | df.to_csv("data_clean.csv", index=False) 45 | 46 | 47 | if __name__ == "__main__": 48 | process() 49 | -------------------------------------------------------------------------------- /data/tabular/rdkit_features/transform.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | import fire 4 | from datasets import load_dataset 5 | 6 | 7 | def 
clean_df(df): 8 | df.dropna(inplace=True) 9 | df[ 10 | [ 11 | "NumHDonors", 12 | "NumHAcceptors", 13 | "NumHeteroatoms", 14 | "RingCount", 15 | "NumRotatableBonds", 16 | "NumAromaticBonds", 17 | "NumAcidGroups", 18 | "NumBasicGroups", 19 | ] 20 | ] = df[ 21 | [ 22 | "NumHDonors", 23 | "NumHAcceptors", 24 | "NumHeteroatoms", 25 | "RingCount", 26 | "NumRotatableBonds", 27 | "NumAromaticBonds", 28 | "NumAcidGroups", 29 | "NumBasicGroups", 30 | ] 31 | ].astype( 32 | int 33 | ) 34 | df["MolLogP"] = df["MolLogP"].astype(float) 35 | df["Apol"] = df["Apol"].astype(float) 36 | df.rename(columns={"text": "SMILES"}, inplace=True) 37 | return df 38 | 39 | 40 | def process(): 41 | if not (os.path.isfile("data_clean.csv")): 42 | df = load_dataset( 43 | "maykcaldas/smiles-transformers", split="validation" 44 | ).to_pandas() 45 | df = clean_df(df) 46 | df["split"] = "valid" 47 | df.to_csv("data_clean.csv", index=False) 48 | del df 49 | 50 | df = load_dataset("maykcaldas/smiles-transformers", split="test").to_pandas() 51 | df = clean_df(df) 52 | df["split"] = "test" 53 | df.to_csv("data_clean.csv", index=False, mode="a", header=False) 54 | del df 55 | 56 | splits = [f"train[{k}%:{k+5}%]" for k in range(0, 100, 5)] 57 | for s in splits: 58 | df = load_dataset("maykcaldas/smiles-transformers", split=s).to_pandas() 59 | df = clean_df(df) 60 | df["split"] = "train" 61 | df.to_csv("data_clean.csv", index=False, mode="a", header=False) 62 | else: 63 | print("Reusing present data_clean.csv.") 64 | 65 | 66 | if __name__ == "__main__": 67 | fire.Fire(process) 68 | -------------------------------------------------------------------------------- /data/tabular/rhea_db_masked/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_json( 6 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-rhea-db/resolve/main/rhea-reaction-smiles_prompts.json" 7 | ) 8 | df.dropna(subset=["masked_rxn_smiles", "missing_component"], inplace=True) 9 | print(len(df)) 10 | df.to_csv("data_clean.csv", index=False) 11 | 12 | 13 | if __name__ == "__main__": 14 | process() 15 | -------------------------------------------------------------------------------- /data/tabular/rhea_db_predictions/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def oxford_comma_join(elements): 5 | if len(elements) == 1: 6 | return elements[0] 7 | elif len(elements) == 2: 8 | return " and ".join(elements) 9 | else: 10 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 11 | 12 | 13 | def process(): 14 | df = pd.read_json( 15 | "https://huggingface.co/datasets/kjappelbaum/chemnlp-rhea-db/resolve/main/rhea-reaction-smiles_prompts.json" 16 | ) 17 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 18 | df["product_string"] = df["products"].apply(oxford_comma_join) 19 | df.rename(columns={"canonical_rxn_smiles": "RXNSMILES"}, inplace=True) 20 | df.dropna(subset=["educt_string", "product_string"], inplace=True) 21 | print(len(df)) 22 | df[["RXNSMILES", "educt_string", "product_string"]].to_csv( 23 | "data_clean.csv", index=False 24 | ) 25 | 26 | 27 | if __name__ == "__main__": 28 | process() 29 | -------------------------------------------------------------------------------- /data/tabular/run_all_transform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | eval "$(conda shell.bash hook)" 4 | conda activate 
chemnlp 5 | 6 | for dir in */ 7 | do ( 8 | echo "$dir" 9 | cd "$dir" 10 | python transform.py 11 | ) 12 | done 13 | -------------------------------------------------------------------------------- /data/tabular/sigma_aldrich_safety_data/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def transform_data(): 6 | file = hf_hub_download( 7 | repo_id="chemNLP/msds_sigma_aldrich", 8 | filename="msds.csv", 9 | repo_type="dataset", 10 | ) 11 | 12 | df = pd.read_csv(file) 13 | df = df.drop(columns=["h_statements"]) 14 | df = df.dropna() 15 | df.to_csv("data_clean.csv") 16 | 17 | 18 | if __name__ == "__main__": 19 | transform_data() 20 | -------------------------------------------------------------------------------- /data/tabular/smiles_to_3d/transform.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | from chemnlp.data.convert import is_longer_than_allowed 4 | 5 | 6 | def process(): 7 | dataset = load_dataset("kjappelbaum/chemnlp-qm9-file-translation") 8 | df = dataset["train"].to_pandas() 9 | df.replace(to_replace="RDKit", value="ChemNLP", inplace=True) 10 | df["is_longer_than_allowed"] = df["mol2000"].apply(is_longer_than_allowed) 11 | df = df[~df["is_longer_than_allowed"]] 12 | print(len(df)) 13 | df = df.replace("RDKit", "ChemNLP", regex=True) 14 | df.to_csv("data_clean.csv", index=False) 15 | 16 | 17 | if __name__ == "__main__": 18 | process() 19 | -------------------------------------------------------------------------------- /data/tabular/thermosol/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def process(): 5 | df = pd.read_csv( 6 | "http://deepchem.io.s3-website-us-west-1.amazonaws.com/datasets/thermosol.csv" 7 | ) 8 | df.rename(columns={"smile": "SMILES"}, inplace=True) 9 | df.dropna(inplace=True) 10 | print(len(df)) 11 | df[["SMILES", "target"]].to_csv("data_clean.csv", index=False) 12 | 13 | 14 | if __name__ == "__main__": 15 | process() 16 | -------------------------------------------------------------------------------- /data/tabular/uniprot_binding_single/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_binding_sites" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.end_binding_site = uniprot.end_binding_site.astype(int) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | uniprot = uniprot[uniprot.end_binding_site == uniprot.start_binding_site] 19 | print(f"Successfully loaded {DATA}! 
{len(uniprot)} rows") 20 | uniprot.to_csv("data_clean.csv", index=False) 21 | print(f"Successfully loaded {DATA}!") 22 | return uniprot 23 | 24 | 25 | if __name__ == "__main__": 26 | load_dataset() 27 | -------------------------------------------------------------------------------- /data/tabular/uniprot_binding_sites_multiple/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_binding_sites" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.end_binding_site = uniprot.end_binding_site.astype(int) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | uniprot = uniprot[uniprot.end_binding_site > uniprot.start_binding_site] 19 | print(f"Successfully loaded {DATA}! {len(uniprot)} rows") 20 | uniprot.to_csv("data_clean.csv", index=False) 21 | print(f"Successfully loaded {DATA}!") 22 | return uniprot 23 | 24 | 25 | if __name__ == "__main__": 26 | load_dataset() 27 | -------------------------------------------------------------------------------- /data/tabular/uniprot_organisms/meta.yaml: -------------------------------------------------------------------------------- 1 | name: uniprot_organisms 2 | description: |- 3 | Organisms in which a amino-acid sequence can be found. 4 | targets: 5 | - id: organisms 6 | description: organisms in which a protein can be found 7 | type: text 8 | names: 9 | - noun: organisms 10 | identifiers: 11 | - id: other 12 | type: AS_SEQUENCE 13 | description: other 14 | license: MIT 15 | links: 16 | - url: https://www.uniprot.org/ 17 | description: data source 18 | num_points: 559428 19 | bibtex: 20 | - |- 21 | @article{10.1093/nar/gkac1052, 22 | author = {The UniProt Consortium}, 23 | title = {UniProt - the Universal Protein Knowledgebase in 2023}, 24 | journal = {Nucleic Acids Research}, 25 | volume = {51}, 26 | number = {D1}, 27 | pages = {D523-D531}, 28 | year = {2022}, 29 | month = {11}, 30 | issn = {0305-1048}, 31 | doi = {10.1093/nar/gkac1052}, 32 | url = {https://doi.org/10.1093/nar/gkac1052}} 33 | templates: 34 | - |- 35 | The protein with the {#amino acid sequence|AA sequence!} {other#} can be found in {#the organism |!}{organisms#}. 36 | - |- 37 | Task: {#Predict|Identify!} the organism in which {#the below|this!} {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. 38 | {#Amino acid sequence |Sequence|AA sequence!}: {other#} 39 | Result: {organisms#} 40 | - |- 41 | User: In what organism can you find the following {#protein|amino acid sequence|AA sequence|polypeptide!}:\n{other#} 42 | Assistant: The given {#protein|amino acid sequence|AA sequence|polypeptide!} can be found in {organisms#}. 43 | - |- 44 | Task: {#Predict|Identify!} the organism in which {#the below|this!} {#protein|amino acid sequence|AA sequence|polypeptide!} can be found. 
45 | {#Amino acid sequence|Sequence|AA sequence!}: {other#} 46 | Result:{organisms#} 47 | -------------------------------------------------------------------------------- /data/tabular/uniprot_organisms/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_organisms" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.rename(columns={"sequence": "other"}, inplace=True) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | print(f"Successfully loaded {DATA}! {len(uniprot)} rows") 19 | uniprot.to_csv("data_clean.csv", index=False) 20 | print(f"Successfully loaded {DATA}!") 21 | return uniprot 22 | 23 | 24 | if __name__ == "__main__": 25 | load_dataset() 26 | -------------------------------------------------------------------------------- /data/tabular/uniprot_reactions/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | DATA = "uniprot_reactions" 5 | 6 | 7 | def load_dataset() -> pd.DataFrame: 8 | uniprot = hf_hub_download( 9 | repo_id="chemnlp/uniprot", 10 | filename=f"{DATA}/data_clean.csv", 11 | repo_type="dataset", 12 | ) 13 | uniprot = pd.read_csv(uniprot) 14 | uniprot.rename(columns={"sequence": "other"}, inplace=True) 15 | uniprot.drop_duplicates( 16 | inplace=True, 17 | ) 18 | print(f"Successfully loaded {DATA}! {len(uniprot)} rows") 19 | uniprot.to_csv("data_clean.csv", index=False) 20 | print(f"Successfully loaded {DATA}!") 21 | return uniprot 22 | 23 | 24 | if __name__ == "__main__": 25 | load_dataset() 26 | -------------------------------------------------------------------------------- /data/tabular/uniprot_sentences/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import regex as re 3 | from huggingface_hub import hf_hub_download 4 | 5 | DATA = "uniprot_sentences" 6 | 7 | 8 | def clean_up_sentences(text: str) -> str: 9 | "Remove (By similarity) from the sentences" 10 | 11 | updated_text = re.sub(r"\s*\((?:By\.? similarity)\)\s*", "", text) 12 | updated_text = updated_text.replace(" . ", ". ") 13 | updated_text = updated_text.replace(" .", ".") 14 | updated_text = updated_text.strip() 15 | if not (updated_text.endswith(".")): 16 | updated_text += "." 17 | return updated_text 18 | 19 | 20 | def load_dataset() -> pd.DataFrame: 21 | uniprot = hf_hub_download( 22 | repo_id="chemnlp/uniprot", 23 | filename=f"{DATA}/data_clean.csv", 24 | repo_type="dataset", 25 | ) 26 | 27 | uniprot = pd.read_csv(uniprot) 28 | uniprot.sentences = uniprot.sentences.apply(clean_up_sentences) 29 | uniprot.drop_duplicates( 30 | inplace=True, 31 | ) 32 | print(f"Successfully loaded {DATA}! 
{len(uniprot)} rows") 33 | uniprot.to_csv("data_clean.csv", index=False) 34 | print(f"Successfully loaded {DATA}!") 35 | return uniprot 36 | 37 | 38 | if __name__ == "__main__": 39 | load_dataset() 40 | -------------------------------------------------------------------------------- /data/tabular/uspto/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def oxford_comma_join(elements): 6 | try: 7 | if len(elements) == 1: 8 | return elements[0] 9 | elif len(elements) == 2: 10 | return " and ".join(elements) 11 | else: 12 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 13 | except Exception: 14 | return None 15 | 16 | 17 | def process(): 18 | file_train = hf_hub_download( 19 | repo_id="kjappelbaum/chemnlp-uspto", 20 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 21 | repo_type="dataset", 22 | ) 23 | df_train = pd.read_json(file_train) 24 | df_train["split"] = "train" 25 | 26 | file_test = hf_hub_download( 27 | repo_id="kjappelbaum/chemnlp-uspto", 28 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 29 | repo_type="dataset", 30 | ) 31 | df_test = pd.read_json(file_test) 32 | df_test["split"] = "test" 33 | 34 | file_valid = hf_hub_download( 35 | repo_id="kjappelbaum/chemnlp-uspto", 36 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 37 | repo_type="dataset", 38 | ) 39 | 40 | df_valid = pd.read_json(file_valid) 41 | df_valid["split"] = "valid" 42 | 43 | df = pd.concat([df_train, df_test, df_valid]) 44 | 45 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 46 | df["product_string"] = df["products"].apply(oxford_comma_join) 47 | df["RXNSMILES"] = df["canonical_rxn_smiles"] 48 | 49 | print(len(df)) 50 | df.to_csv("data_clean.csv", index=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | process() 55 | -------------------------------------------------------------------------------- /data/tabular/uspto_yield/transform.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from huggingface_hub import hf_hub_download 3 | 4 | 5 | def oxford_comma_join(elements): 6 | try: 7 | if len(elements) == 1: 8 | return elements[0] 9 | elif len(elements) == 2: 10 | return " and ".join(elements) 11 | else: 12 | return ", ".join(elements[:-1]) + ", and " + elements[-1] 13 | except Exception: 14 | return None 15 | 16 | 17 | def process(): 18 | file_train = hf_hub_download( 19 | repo_id="kjappelbaum/chemnlp-uspto", 20 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 21 | repo_type="dataset", 22 | ) 23 | df_train = pd.read_json(file_train) 24 | df_train["split"] = "train" 25 | 26 | file_test = hf_hub_download( 27 | repo_id="kjappelbaum/chemnlp-uspto", 28 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 29 | repo_type="dataset", 30 | ) 31 | df_test = pd.read_json(file_test) 32 | df_test["split"] = "test" 33 | 34 | file_valid = hf_hub_download( 35 | repo_id="kjappelbaum/chemnlp-uspto", 36 | filename="US_patents_1976-Sep2016_1product_reactions_test_prompts.json", 37 | repo_type="dataset", 38 | ) 39 | 40 | df_valid = pd.read_json(file_valid) 41 | df_valid["split"] = "valid" 42 | 43 | df = pd.concat([df_train, df_test, df_valid]) 44 | df = df.query("WithinTolerance == True") 45 | df["yield"] = df["MeanYield"] 46 | df["educt_string"] = df["educts"].apply(oxford_comma_join) 47 | 
df["product_string"] = df["products"].apply(oxford_comma_join) 48 | df["RXNSMILES"] = df["canonical_rxn_smiles"] 49 | print(len(df)) 50 | df.to_csv("data_clean.csv", index=False) 51 | 52 | 53 | if __name__ == "__main__": 54 | process() 55 | -------------------------------------------------------------------------------- /data/tabular/volume_of_distribution_at_steady_state_lombardo_et_al/meta.yaml: -------------------------------------------------------------------------------- 1 | name: volume_of_distribution_at_steady_state_lombardo_et_al 2 | description: |- 3 | The volume of distribution at steady state (VDss) measures the degree 4 | of a drug's concentration in the body tissue compared to concentration in the blood. 5 | Higher VD indicates a higher distribution in the tissue and usually indicates 6 | the drug with high lipid solubility, low plasma protein binding rate. 7 | targets: 8 | - id: VDss_Lombardo 9 | description: volume of distribution at steady state (VDss) 10 | units: L/kg 11 | type: continuous 12 | names: 13 | - noun: volume of distribution at steady state (VDss) 14 | - noun: VDss 15 | uris: 16 | - http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#C85538 17 | benchmarks: 18 | - name: TDC 19 | link: https://tdcommons.ai/ 20 | split_column: split 21 | identifiers: 22 | - id: SMILES 23 | type: SMILES 24 | description: SMILES 25 | - id: compound_name 26 | type: Other 27 | names: 28 | - noun: compound name 29 | - noun: drug name 30 | - noun: generic drug name 31 | description: mix of drug name and ids 32 | license: CC BY 4.0 33 | links: 34 | - url: https://doi.org/10.1021/acs.jcim.6b00044 35 | description: corresponding publication 36 | - url: https://tdcommons.ai/single_pred_tasks/adme/#vdss-volumn-of-distribution-at-steady-state-lombardo-et-al 37 | description: data source 38 | num_points: 1130 39 | bibtex: 40 | - |- 41 | @article{Lombardo2016, 42 | doi = {10.1021/acs.jcim.6b00044}, 43 | url = {https://doi.org/10.1021/acs.jcim.6b00044}, 44 | year = {2016}, 45 | month = sep, 46 | publisher = {merican Chemical Society (ACS)}, 47 | volume = {56}, 48 | number = {10}, 49 | pages = {2042--2052}, 50 | author = {Franco Lombardo and Yankang Jing}, 51 | title = {In Silico Prediction of Volume of Distribution in Humans. Extensive Data Set and the 52 | Exploration of Linear and Nonlinear Methods Coupled with Molecular Interaction Fields Descriptors}, 53 | journal = {Journal of Chemical Information and Modeling} 54 | -------------------------------------------------------------------------------- /data/tabular/zinc/meta.yaml: -------------------------------------------------------------------------------- 1 | name: zinc 2 | description: |- 3 | ZINC is a free database of commercially-available compounds for virtual screening. 4 | It contains over 230 million purchasable compounds in ready-to-dock, 3D formats. 5 | TDC uses a 250,000 sampled version from the original Mol-VAE paper. 6 | identifiers: 7 | - id: SMILES 8 | type: SMILES 9 | description: SMILES 10 | license: |- 11 | ZINC is free to use for everyone. 12 | Redistribution of significant subsets requires written permission from the authors. 
13 | links: 14 | - url: https://pubs.acs.org/doi/full/10.1021/acs.jcim.5b00559 15 | description: Article about original dataset 16 | - url: https://pubs.acs.org/doi/abs/10.1021/acscentsci.7b00572 17 | description: Exemplary related article shown in tdc's website 18 | num_points: 249455 19 | bibtex: 20 | - |- 21 | @article{doi:10.1021/acs.jcim.5b00559, 22 | author = {Sterling, Teague and Irwin, John J.}, 23 | title = {ZINC 15 - Ligand Discovery for Everyone}, 24 | journal = {Journal of Chemical Information and Modeling}, 25 | volume = {55}, 26 | number = {11}, 27 | pages = {2324-2337}, 28 | year = {2015}, 29 | doi = {10.1021/acs.jcim.5b00559}, 30 | note ={PMID: 26479676}, 31 | URL = {https://doi.org/10.1021/acs.jcim.5b00559}, 32 | eprint = {https://doi.org/10.1021/acs.jcim.5b00559}, 33 | } 34 | -------------------------------------------------------------------------------- /data/text_sampling/get_dataset_overlap.py: -------------------------------------------------------------------------------- 1 | import glob 2 | 3 | import pandas as pd 4 | 5 | skip_ds = [ 6 | "rdkit_features", 7 | "iupac_smiles", 8 | "orbnet_denali", 9 | "qmof_gcmc", 10 | "qmof_quantum", 11 | "zinc", 12 | ] 13 | 14 | if __name__ == "__main__": 15 | path_base = __file__.replace("text_sampling/get_dataset_overlap.py", "") 16 | fns = sorted(glob.glob(path_base + "tabular/**/data_clean.csv")) 17 | for i in range(len(fns)): 18 | for j in range(i + 1, len(fns)): 19 | fn1 = fns[i] 20 | fn2 = fns[j] 21 | ds1 = fn1.split("/")[-2] 22 | ds2 = fn2.split("/")[-2] 23 | if (ds1 in skip_ds) or (ds2 in skip_ds): 24 | continue 25 | df1 = pd.read_csv( 26 | fn1, index_col=False, low_memory=False, nrows=0 27 | ) # only get columns 28 | df2 = pd.read_csv( 29 | fn2, index_col=False, low_memory=False, nrows=0 30 | ) # only get columns 31 | if ("SMILES" in df1.columns) and ("SMILES" in df2.columns): 32 | df1 = pd.read_csv( 33 | fn1, index_col=False, low_memory=False, usecols=["SMILES"] 34 | ) 35 | df2 = pd.read_csv( 36 | fn2, index_col=False, low_memory=False, usecols=["SMILES"] 37 | ) 38 | print( 39 | fn1.split("/")[-2], 40 | fn2.split("/")[-2], 41 | len(set(df1.SMILES) & set(df2.SMILES)), 42 | ) 43 | -------------------------------------------------------------------------------- /data/text_sampling/utils.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | 4 | def str_presenter(dumper, data: dict): 5 | """configures yaml for dumping multiline strings 6 | Ref: https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data 7 | """ 8 | if data.count("\n") > 0: # check for multiline string 9 | return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|") 10 | return dumper.represent_scalar("tag:yaml.org,2002:str", data) 11 | 12 | 13 | def load_yaml(path: str) -> dict: 14 | """Load yaml file from path.""" 15 | with open(path, "r") as stream: 16 | try: 17 | data = yaml.safe_load(stream) 18 | except yaml.YAMLError as exc: 19 | raise Exception(exc) 20 | return data 21 | -------------------------------------------------------------------------------- /docs/api/meta_yaml_generator.md: -------------------------------------------------------------------------------- 1 | # Meta YAML Generator 2 | 3 | ## Overview 4 | 5 | The Meta YAML Generator is a tool designed to automatically create a `meta.yaml` file for chemical datasets using Large Language Models (LLMs). 
It analyzes the structure of a given DataFrame and generates a comprehensive metadata file, including advanced sampling methods and template formats. 6 | 7 | The model used by default is `gpt4o`. For using it, you need to expose the `OPENAI_API_KEY` environment variable. 8 | 9 | ## `generate_meta_yaml` 10 | 11 | ::: chemnlp.data.meta_yaml_generator.generate_meta_yaml 12 | handler: python 13 | options: 14 | show_root_heading: true 15 | show_source: false 16 | 17 | ## Usage Example 18 | 19 | ```python 20 | import pandas as pd 21 | from chemnlp.data.meta_yaml_generator import generate_meta_yaml 22 | 23 | # Load your dataset 24 | df = pd.read_csv("your_dataset.csv") 25 | 26 | # Generate meta.yaml 27 | meta_yaml = generate_meta_yaml( 28 | df, 29 | dataset_name="Polymer Properties Dataset", 30 | description="A dataset of polymer properties including glass transition temperatures and densities", 31 | output_path="path/to/save/meta.yaml" 32 | ) 33 | 34 | # The meta_yaml variable now contains the dictionary representation of the meta.yaml 35 | # If an output_path was provided, the meta.yaml file has been saved to that location 36 | ``` 37 | 38 | You can also use it as a command-line tool: 39 | 40 | ```bash 41 | python -m chemnlp.data.meta_yaml_generator path/to/your_dataset.csv --dataset_name "Polymer Properties Dataset" --description "A dataset of polymer properties including glass transition temperatures and densities" --output_path "path/to/save/meta.yaml" 42 | ``` 43 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # ChemNLP 2 | 3 | ChemNLP is an effort to create the largest dataset of chemical data. 4 | We then use this dataset to train large language models (LLMs). 
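For a concrete first taste of the pipeline, here is one invocation of the sampling CLI, taken verbatim from `experiments/ablations/20240814_sample_data.bash` (see the User Guide and the Sampler CLI reference for the full option set):

```bash
# Convert one tabular dataset into class-balanced text samples
chemnlp-sample data/tabular/MUV_846/ sampled --class_balanced
```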
5 | -------------------------------------------------------------------------------- /experiments/ablations/20240814_sample_data.bash: -------------------------------------------------------------------------------- 1 | # Without wrappers 2 | # Benchmarking 3 | chemnlp-sample data/tabular/lipophilicity sampled_benchmark/ --benchmarking --class_balanced 4 | chemnlp-sample data/tabular/bicerano_dataset sampled_benchmark/ --benchmarking --class_balanced 5 | chemnlp-sample data/tabular/opv sampled_benchmark/ --benchmarking --class_balanced 6 | chemnlp-sample data/tabular/melting_points sampled_benchmark/ --benchmarking --class_balanced 7 | chemnlp-sample data/tabular/bc5disease sampled_benchmark/ --benchmarking --class_balanced 8 | chemnlp-sample data/tabular/MUV_846 sampled_benchmark/ --benchmarking --class_balanced 9 | 10 | # Default 11 | chemnlp-sample data/tabular/lipophilicity/ sampled --class_balanced 12 | chemnlp-sample data/tabular/bicerano_dataset/ sampled --class_balanced 13 | chemnlp-sample data/tabular/opv/ sampled --class_balanced 14 | chemnlp-sample data/tabular/melting_points/ sampled --class_balanced 15 | chemnlp-sample data/tabular/bc5disease/ sampled --class_balanced 16 | chemnlp-sample data/tabular/MUV_846/ sampled --class_balanced 17 | 18 | 19 | # With wrappers 20 | # Benchmarking 21 | chemnlp-sample data/tabular/lipophilicity/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 22 | chemnlp-sample data/tabular/bicerano_dataset/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 23 | chemnlp-sample data/tabular/opv/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 24 | chemnlp-sample data/tabular/melting_points/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 25 | chemnlp-sample data/tabular/bc5disease/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 26 | chemnlp-sample data/tabular/MUV_846/ sampled_benchmark_wrapped/ --benchmarking --wrap-identifiers --class_balanced 27 | 28 | # Default 29 | chemnlp-sample data/tabular/lipophilicity/ sampled_wrapped --wrap-identifiers --class_balanced 30 | chemnlp-sample data/tabular/bicerano_dataset/ sampled_wrapped --wrap-identifiers --class_balanced 31 | chemnlp-sample data/tabular/opv/ sampled_wrapped --wrap-identifiers --class_balanced 32 | chemnlp-sample data/tabular/melting_points/ sampled_wrapped --wrap-identifiers --class_balanced 33 | chemnlp-sample data/tabular/bc5disease/ sampled_wrapped --wrap-identifiers --class_balanced 34 | chemnlp-sample data/tabular/MUV_846/ sampled_wrapped --wrap-identifiers --class_balanced 35 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/data_mixing.yml: -------------------------------------------------------------------------------- 1 | data_paths: ["/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/hendrycks_STEM", "/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv"] 2 | num_tokens: [2826240, 28262400] 3 | context_length: 2048 4 | save_path: "/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/2826240hendrycks_28262400chemrxiv" 5 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/hf_data.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | dataset_name: "EleutherAI/pile" 4 | dataset_args: {"name": "pubmed", "split": "train"} 5 
| batch_size: 1 6 | string_key: "text" 7 | save_path: "/fsx/proj-chemnlp/data/example_tokenised" 8 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/hf_data_wiki.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | dataset_name: "wikipedia" 4 | dataset_args: {"name": "20220301.en", "split": "train", "beam_runner": "DirectRunner"} 5 | batch_size: 1000 6 | out_dir: "/fsx/proj-chemnlp/data" 7 | string_key: "text" 8 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/prep_lm_eval_data.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | tasks: ["hendrycksTest-college_biology", "hendrycksTest-college_chemistry", "hendrycksTest-college_mathematics", "hendrycksTest-college_physics", "hendrycksTest-high_school_mathematics", "hendrycksTest-high_school_biology", "hendrycksTest-high_school_chemistry", "hendrycksTest-high_school_physics"] 4 | data_split: "validation" 5 | out_dir: "/fsx/proj-chemnlp/data" 6 | save_name: "hendrycks_STEM_2" 7 | -------------------------------------------------------------------------------- /experiments/configs/data_configs/prep_smiles_data.yml: -------------------------------------------------------------------------------- 1 | model_name: "EleutherAI/pythia-1b" 2 | context_length: 2048 3 | data_split: "train" 4 | out_dir: "/fsx/proj-chemnlp/data" 5 | save_name: "coconut_smiles" 6 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_S1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 1, 10 | "overlap_comm": true, 11 | "contiguous_gradients": true, 12 | "sub_group_size": 1e9, 13 | "reduce_bucket_size": "auto" 14 | }, 15 | "gradient_accumulation_steps": "auto", 16 | "gradient_clipping": "auto", 17 | "train_batch_size": "auto", 18 | "train_micro_batch_size_per_gpu": "auto" 19 | } 20 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_S2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 2, 10 | "offload_optimizer": { 11 | "device": "none", 12 | "pin_memory": true 13 | }, 14 | "overlap_comm": true, 15 | "contiguous_gradients": true, 16 | "sub_group_size": 1e9, 17 | "reduce_bucket_size": "auto" 18 | }, 19 | "gradient_accumulation_steps": "auto", 20 | "gradient_clipping": "auto", 21 | "train_batch_size": "auto", 22 | "train_micro_batch_size_per_gpu": "auto" 23 | } 24 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_offload_S2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 2, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | }, 14 | "overlap_comm": true, 15 | "contiguous_gradients": true, 16 | "sub_group_size": 
1e9, 17 | "reduce_bucket_size": "auto" 18 | }, 19 | "gradient_accumulation_steps": "auto", 20 | "gradient_clipping": "auto", 21 | "train_batch_size": "auto", 22 | "train_micro_batch_size_per_gpu": "auto" 23 | } 24 | -------------------------------------------------------------------------------- /experiments/configs/deepspeed/deepspeed_offload_S3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto" 4 | }, 5 | "bf16": { 6 | "enabled": true 7 | }, 8 | "zero_optimization": { 9 | "stage": 3, 10 | "offload_optimizer": { 11 | "device": "cpu", 12 | "pin_memory": true 13 | }, 14 | "offload_param": { 15 | "device": "cpu", 16 | "pin_memory": true 17 | }, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | }, 28 | "gradient_accumulation_steps": "auto", 29 | "gradient_clipping": "auto", 30 | "train_batch_size": "auto", 31 | "train_micro_batch_size_per_gpu": "auto" 32 | } 33 | -------------------------------------------------------------------------------- /experiments/configs/eval_configs/default_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_1b/checkpoint-final" 3 | # model_args: "pretrained=EleutherAI/pythia-1b" 4 | tasks: "hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_mathematics,hendrycksTest-college_physics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_physics" 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation 10 | wandb_run_name: 1B_fullfinetune_STEM 11 | wandb_entity: chemnlp 12 | -------------------------------------------------------------------------------- /experiments/configs/eval_configs/nlp_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/path/checkpoint" # update 3 | tasks: "lambada_standard" 4 | num_fewshot: 0 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation # update 10 | wandb_run_name: 1B_fullfinetune # update 11 | wandb_entity: chemnlp 12 | -------------------------------------------------------------------------------- /experiments/configs/eval_configs/safety_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/path/checkpoint" # update 3 | tasks: "crows_pairs_english_race_color,crows_pairs_english_socioeconomic,crows_pairs_english_gender,crows_pairs_english_age,crows_pairs_english_religion,crows_pairs_english_disability,crows_pairs_english_sexual_orientation,crows_pairs_english_nationality,crows_pairs_english_physical_appearance" 4 | num_fewshot: 0 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation # update 10 | wandb_run_name: 1B_fullfinetune # update 11 | wandb_entity: chemnlp 12 | 
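The evaluation configs above are plain YAML consumed by `lm-evaluation-harness/main_eval.py` (see `experiments/scripts/run_eval.sh`). The sketch below is illustrative only: the helper function and the hard-coded paths are assumptions for demonstration, not part of the repository. It shows how such a config can be loaded and patched with PyYAML before launching a run:

```python
import yaml


def load_and_override(config_path: str, overrides: dict) -> dict:
    """Load an eval YAML config and apply top-level overrides (illustrative helper)."""
    with open(config_path) as fh:
        cfg = yaml.safe_load(fh)
    cfg.update(overrides)  # e.g. point model_args at a freshly trained checkpoint
    return cfg


cfg = load_and_override(
    "experiments/configs/eval_configs/safety_eval_config.yaml",
    {
        "model_args": "pretrained=/fsx/path/checkpoint",
        "wandb_run_name": "1B_fullfinetune",
    },
)
print(cfg["tasks"])
```

This mirrors what `experiments/scripts/eval_create_batch_configs.py` does when it rewrites `model_args` and `wandb_run_name` for every checkpoint directory.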
-------------------------------------------------------------------------------- /experiments/configs/eval_configs/stem_eval_config.yaml: -------------------------------------------------------------------------------- 1 | model: hf-causal 2 | model_args: "pretrained=/fsx/path/checkpoint" # update 3 | tasks: "pile_pubmed-abstracts,pile_pubmed-central,headqa_en,sciq,pubmedqa,is_smiles,complete_smiles,periodic_table,openbookqa,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_mathematics,hendrycksTest-college_physics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_physics" 4 | num_fewshot: 0 5 | batch_size: 12 6 | device: "cuda:0" 7 | wandb_log: true 8 | wandb_project: LLCheM 9 | wandb_group: evaluation # update 10 | wandb_run_name: 1B_fullfinetune # update 11 | wandb_entity: chemnlp 12 | -------------------------------------------------------------------------------- /experiments/configs/gpt-neox/cluster_setup.yml: -------------------------------------------------------------------------------- 1 | # Suggested data paths when using GPT-NeoX locally 2 | { 3 | # see example configs for sampling options 4 | "data-path": "/fsx/proj-chemnlp/data/marianna13/chemrxiv/data_text_document", "save": "/fsx/proj-chemnlp/experiments/checkpoints/finetuned/pythia-160M", "load": "/fsx/proj-chemnlp/experiments/checkpoints/pretrained/pythia-160M", "finetune": True, "checkpoint_validation_with_forward_pass": False, "log-dir": "/fsx/proj-chemnlp/experiments/logs", "log_interval": 100, "log_grad_pct_zeros": False, "log_param_norm": False, "log_grad_norm": False, "use_wandb": True, "wandb_host": "https://stability.wandb.io", "wandb_project": "LLCheM", "wandb_group": "Test Runs", "hostfile": "/mock_path", "num_gpus": 1} 5 | -------------------------------------------------------------------------------- /experiments/configs/gpt-neox/soft_prompt.yml: -------------------------------------------------------------------------------- 1 | { 2 | # peft method settings 3 | "soft_prompt_tuning": {"enabled": True, # also freezes all other parameters 4 | "n_tokens": 10, "init_string": "", "init_range": 0.5}} 5 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/160M_full.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-160m/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-160m 8 | revision: main # latest model 9 | #checkpoint_path: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_160M/checkpoint-1600 10 | # Training strategies (PromptTuningConfig arguments) 11 | prompt_tuning: 12 | enabled: false 13 | # Training configuration (TrainerArguments from HF) 14 | trainer: 15 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_160M 16 | num_train_epochs: 1 17 | learning_rate: 3e-4 18 | evaluation_strategy: steps 19 | logging_steps: 50 20 | eval_steps: 500 21 | save_steps: 1000 22 | dataloader_num_workers: 4 23 | bf16: true 24 | fp16: false 25 | per_device_train_batch_size: 4 26 | per_device_eval_batch_size: 4 27 | # Logging configuration (WandB init arguments) 28 | wandb: 29 | enabled: true 30 | project: LLCheM 31 | group: test 32 | name: test_160M_full_v2 # 
full_160M_v1 33 | entity: chemnlp 34 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/160M_ptune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-160m/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-160m 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: true 12 | num_virtual_tokens: 10 13 | prompt_tuning_init_text: " " 14 | # Training configuration (TrainerArguments from HF) 15 | trainer: 16 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/160M 17 | num_train_epochs: 1 18 | learning_rate: 3e-4 19 | evaluation_strategy: steps 20 | logging_steps: 5 21 | eval_steps: 50 22 | save_steps: 200 23 | dataloader_num_workers: 4 24 | bf16: true 25 | fp16: false 26 | per_device_train_batch_size: 30 27 | per_device_eval_batch_size: 30 28 | # Logging configuration (WandB init arguments) 29 | wandb: 30 | enabled: true 31 | project: LLCheM 32 | group: test 33 | name: peft_160M_v1 34 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/1B_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-1b 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_1b 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 50 19 | eval_steps: 500 20 | save_steps: 1000 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 2 25 | per_device_eval_batch_size: 8 26 | # Logging configuration (WandB init arguments) 27 | wandb: 28 | enabled: true 29 | project: LLCheM 30 | group: test 31 | name: test_1b_fine_tune 32 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/3B_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-2.8b 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_3b 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 10 19 | eval_steps: 50 20 | save_steps: 500 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 8 25 | 
per_device_eval_batch_size: 1 26 | gradient_checkpointing: True 27 | deepspeed_config: deepspeed_S2.json 28 | # Logging configuration (WandB init arguments) 29 | wandb: 30 | enabled: true 31 | project: LLCheM 32 | group: 3B_deepspeed 33 | name: 3B_fine_tune 34 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/410M_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-410m/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-410M 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_410M 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 50 19 | eval_steps: 500 20 | save_steps: 1000 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 2 25 | per_device_eval_batch_size: 2 26 | # Logging configuration (WandB init arguments) 27 | wandb: 28 | enabled: true 29 | project: LLCheM 30 | group: test 31 | name: test_410M_fine_tune_2 32 | entity: chemnlp 33 | -------------------------------------------------------------------------------- /experiments/configs/hugging-face/7B_fine_tune.yml: -------------------------------------------------------------------------------- 1 | # Dataset configuration (datasets.load_from_disk arguments) 2 | data: 3 | path: /fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv 4 | # Model configuration (model.from_pretrained arguments) 5 | model: 6 | base: GPTNeoXForCausalLM 7 | name: EleutherAI/pythia-6.9b 8 | revision: main # latest model 9 | # Training strategies (PromptTuningConfig arguments) 10 | prompt_tuning: 11 | enabled: false 12 | # Training configuration (TrainerArguments from HF) 13 | trainer: 14 | output_dir: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_7b_test 15 | num_train_epochs: 1 16 | learning_rate: 3e-4 17 | evaluation_strategy: steps 18 | logging_steps: 10 19 | eval_steps: 50 20 | save_steps: 500 21 | dataloader_num_workers: 4 22 | bf16: true 23 | fp16: false 24 | per_device_train_batch_size: 10 25 | per_device_eval_batch_size: 1 26 | gradient_checkpointing: True 27 | deepspeed_config: deepspeed_offload_S3.json 28 | # Logging configuration (WandB init arguments) 29 | wandb: 30 | enabled: true 31 | project: LLCheM 32 | group: 7B_deepspeed 33 | name: 7B_fine_tune_test 34 | -------------------------------------------------------------------------------- /experiments/data/merge_epmc_to_jsonl.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is specifically for taking a nested folder of jsonlines files 3 | and merging them into one complete jsonlines file. 4 | 5 | e.g. 6 | /2022_05_27/file1.jsonl 7 | /2022_05_25/file2.jsonl 8 | ... 
9 | """ 10 | 11 | import multiprocessing 12 | import os 13 | from typing import List 14 | 15 | import jsonlines 16 | from tqdm import tqdm 17 | 18 | # NOTE hardcoded paths 19 | ROOT = "/fsx/proj-chemnlp/jeb/europmc_deduped" 20 | OUT_DIR = "/fsx/proj-chemnlp/jeb/europmc_deduped/merged_paper_and_abstracts" 21 | 22 | 23 | def write_to_merged_file(all_files: List[str], name: str): 24 | """Loops over all_files, reads them and writes them to the out_file""" 25 | out_file = f"{OUT_DIR}/merged_all_{name}.jsonl" 26 | print(f"Writing {len(all_files)} {name} files to {out_file}") 27 | # start context manager for writing 28 | with jsonlines.open(out_file, mode="w") as writer: 29 | for file in tqdm(all_files): 30 | # as writing is serial, this cannot be easily parallelised 31 | with jsonlines.open(file) as reader: 32 | all_entries = [*reader] 33 | writer.write_all(all_entries) 34 | 35 | 36 | if __name__ == "__main__": 37 | # collect all files 38 | result = os.popen(f"find {ROOT} -type f -name '*.jsonl'") 39 | parsed_result = result.read().split("\n") 40 | all_files = [x for x in parsed_result if x] 41 | paper_files = [x for x in all_files if "ft" in x] 42 | abstract_files = [x for x in all_files if "abs" in x] 43 | print( 44 | f"{len(all_files)} files, {len(paper_files)} paper and {len(abstract_files)} abstract files." 45 | ) 46 | 47 | # merge and process 48 | with multiprocessing.Pool(os.cpu_count()) as p: 49 | p.starmap( 50 | write_to_merged_file, 51 | [(paper_files, "papers"), (abstract_files, "abstracts")], 52 | ) 53 | -------------------------------------------------------------------------------- /experiments/data/prepare_gptneox_chemrxiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preparing chemrxiv dataset as per GPT-NeoX guidelines 3 | NOTE this needs to be run from the root of this repository directory 4 | 5 | Example usage: 6 | python experiments/chem_data_prep.py /fsx/proj-chemnlp/data/ chemnlp/gpt-neox/ 7 | """ 8 | 9 | import argparse 10 | import os 11 | 12 | import datasets 13 | import jsonlines 14 | 15 | DATASET = "marianna13/chemrxiv" 16 | GPT_NEOX_KEY = "text" 17 | 18 | if __name__ == "__main__": 19 | # parse args 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "save_dir", help="Where you want to store the prepared dataset." 23 | ) 24 | parser.add_argument( 25 | "gptneox_dir", help="Where you can find the GPT-NeoX repository." 26 | ) 27 | args = parser.parse_args() 28 | 29 | # save initial strings from chemrxiv articles as jsonlines 30 | chem_data = datasets.load_dataset(DATASET) 31 | all_full_text_samples = [ 32 | {GPT_NEOX_KEY: paper["TEXT"]} for paper in chem_data["train"] 33 | ] 34 | save_path = f"{args.save_dir}/{DATASET}" 35 | data_path = f"{save_path}/data.jsonl" 36 | os.makedirs(save_path, exist_ok=True) 37 | with jsonlines.open(data_path, "w") as writer: 38 | writer.write_all(all_full_text_samples) 39 | 40 | # execute gpt-neox processing 41 | gpt_tool_path = f"{args.gptneox_dir}/tools/preprocess_data.py" 42 | os.system( 43 | f""" 44 | python {gpt_tool_path} 45 | --input {data_path} 46 | --output-prefix {save_path}/data 47 | --vocab /fsx/pile/20B_tokenizer.json 48 | --dataset-impl mmap 49 | --tokenizer-type HFTokenizer --append-eod 50 | """ 51 | ) 52 | -------------------------------------------------------------------------------- /experiments/data/sbatch_hf_dataset.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="llched-tokenise" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/tokenise_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/tokenise_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs tokenisation of Hugging Face datasets 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the full path to the tokenisation configuration file 17 | 18 | set -ex # allow for exiting based on non-0 codes 19 | 20 | # set workdir 21 | cd /fsx/proj-chemnlp/$2/chemnlp 22 | 23 | # create environment 24 | source experiments/scripts/env_creation_hf.sh $1 $2 25 | pip install ".[tokenisation]" 26 | 27 | # trigger run 28 | python experiments/data/prepare_hf_dataset.py $3 29 | -------------------------------------------------------------------------------- /experiments/data/sbatch_hf_split.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llched-split" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/split_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/split_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs tokenisation of Hugging Face datasets 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the path to the raw jsonlines file 17 | ### The fourth arg ($4) is the fractional size of the test / validation sets 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | 21 | # set workdir 22 | cd /fsx/proj-chemnlp/$2/chemnlp 23 | 24 | # create environment 25 | source experiments/scripts/env_creation_hf.sh $1 $2 26 | 27 | # trigger run 28 | python experiments/data/split_data.py $3 $4 29 | -------------------------------------------------------------------------------- /experiments/data/sbatch_merge_epmc_jsonl.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="merge-epmc" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/data_merge_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/data_merge_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### The first arg ($1) is the prefix directory where the environment is saved 14 | ### The second arg ($2) is the directory to use when building the environment 15 | 16 | set -ex # allow for exiting based on non-0 codes 17 | 18 | # set workdir 19 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 20 | 21 | # create environment 22 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 23 | 24 | # trigger run 25 | cd $CHEMNLP_PATH 26 | python experiments/data/merge_epmc_to_jsonl.py 27 | -------------------------------------------------------------------------------- /experiments/scripts/env_creation_hf.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ### This script creates a conda environment for chemnlp 3 | ### The first arg ($1) is the prefix directory where the environment is saved 4 | ### The second arg ($2) is the directory to use when building the environment 5 | 6 | ## Must already have miniconda installed! 7 | export CONDA_ENV_PATH=/fsx/proj-chemnlp/$1/conda/env/chemnlp-hf 8 | export PYTHON_VER=3.8 9 | CUDA_VERSION=11.7 10 | CONDA_BASE=$(conda info --base) 11 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 12 | 13 | ## ensure we can use activate syntax in slurm scripts 14 | source $CONDA_BASE/etc/profile.d/conda.sh 15 | 16 | # Create Python environment through conda 17 | if [ -d "${CONDA_ENV_PATH}" ] 18 | then 19 | # if already exists activate 20 | echo "Found ${CONDA_ENV_PATH} in the directory, activating it!" 21 | conda activate ${CONDA_ENV_PATH} 22 | else 23 | # otherwise create new env (race conditions exist) 24 | echo "Creating ${CONDA_ENV_PATH} environment!" 25 | conda create --force --prefix ${CONDA_ENV_PATH} python=${PYTHON_VER} -y 26 | conda activate ${CONDA_ENV_PATH} 27 | 28 | ## clone + submodules (ok if exists) 29 | cd /fsx/proj-chemnlp/$2 30 | [ ! -d 'chemnlp' ] && git clone --recurse-submodules git@github.com:OpenBioML/chemnlp.git 31 | 32 | ## install core requirements 33 | conda install -y pytorch torchvision torchaudio pytorch-cuda=${CUDA_VERSION} -c pytorch -c nvidia --verbose 34 | cd $CHEMNLP_PATH 35 | pip install ".[training]" 36 | pip install lm-evaluation-harness/ 37 | fi 38 | -------------------------------------------------------------------------------- /experiments/scripts/env_creation_neox.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | ### This script creates a conda environment for chemnlp 3 | ### The first arg ($1) is the prefix directory where the environment is saved 4 | ### The second arg ($2) is the directory to use when building the environment 5 | 6 | ## Must already have miniconda installed! 
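# (Illustrative usage note, not part of the original script.) Following the argument
# convention described above, and mirroring how env_creation_hf.sh is sourced from the
# sbatch scripts, a typical invocation would be:
#   source experiments/scripts/env_creation_neox.sh <env-prefix-dir> <build-dir>
# where <env-prefix-dir> and <build-dir> are placeholders for your own directories.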
7 | export CONDA_ENV_PATH=/fsx/proj-chemnlp/$1/conda/env/chemnlp-neox 8 | export PYTHON_VER=3.8 9 | 10 | ## ensure we can use activate syntax in slurm scripts 11 | CONDA_BASE=$(conda info --base) 12 | source $CONDA_BASE/etc/profile.d/conda.sh 13 | 14 | # Create Python environment through conda 15 | if [ -d "${CONDA_ENV_PATH}" ]; then rm -Rf ${CONDA_ENV_PATH}; fi 16 | conda create --force --prefix ${CONDA_ENV_PATH} python=${PYTHON_VER} -y 17 | conda activate ${CONDA_ENV_PATH} 18 | 19 | # Python requirements 20 | ## cd into your directory inside of proj-chemnlp 21 | cd /fsx/proj-chemnlp/$2 22 | 23 | ## clone + submodules (ok if exists) 24 | [ ! -d 'chemnlp' ] && git clone --recurse-submodules git@github.com:OpenBioML/chemnlp.git 25 | 26 | ## install 27 | cd chemnlp/gpt-neox 28 | pip install -r requirements/requirements.txt # base gpt-neox reqs 29 | pip install -r requirements/requirements-wandb.txt # add wand monitoring reqs 30 | 31 | ## downgrades / pins 32 | pip install protobuf=="3.20" 33 | pip install numpy=="1.23" 34 | -------------------------------------------------------------------------------- /experiments/scripts/eval_create_batch_configs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import yaml 5 | from lm_eval import config 6 | 7 | CHECKPOINT_DIR = "checkpoint-final" 8 | 9 | 10 | def run( 11 | config_path: str, 12 | root_models_path: str, 13 | ): 14 | raw_config = config.load_config(config_path) 15 | 16 | model_names = [ 17 | name 18 | for name in os.listdir(root_models_path) 19 | if os.path.isdir(os.path.join(root_models_path, name)) 20 | ] 21 | 22 | for model_name in model_names: 23 | raw_config["model_args"] = ( 24 | f"pretrained={root_models_path}/{model_name}/{CHECKPOINT_DIR}" 25 | ) 26 | raw_config["wandb_run_name"] = model_name 27 | 28 | with open( 29 | f"{root_models_path}/{model_name}/eval_config.yml", "w" 30 | ) as new_config: 31 | yaml.dump(raw_config, new_config) 32 | 33 | 34 | if __name__ == "__main__": 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument( 37 | "config_path", help="The full path to the example YAML config file." 38 | ) 39 | parser.add_argument( 40 | "root_models_path", 41 | help="The full path to the parent directory containing models.", 42 | ) 43 | args = parser.parse_args() 44 | run(args.config_path, args.root_models_path) 45 | -------------------------------------------------------------------------------- /experiments/scripts/miniconda_install.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | cd ~ 4 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 5 | bash Miniconda3-latest-Linux-x86_64.sh # Follow instructions, accept all conditions blindly 6 | -------------------------------------------------------------------------------- /experiments/scripts/run_eval.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="chemtest" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/eval_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/eval_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs lm_eval2 experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the eval config.yaml file 17 | 18 | set -ex # allow for exiting based on non-0 codes 19 | overrides=${4:-'{}'} 20 | # set workdir 21 | CHEMNLP_PATH=/fsx/proj-chemnlp/$1/chemnlp 22 | 23 | # create environment 24 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 25 | 26 | export TOKENIZERS_PARALLELISM=false 27 | 28 | # trigger run 29 | cd $CHEMNLP_PATH/lm-evaluation-harness 30 | python main_eval.py $3 --config_overrides $overrides 31 | -------------------------------------------------------------------------------- /experiments/scripts/run_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="chemtest" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/batch_eval_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/batch_eval_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs lm_eval2 experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the default eval config.yaml file 17 | ### The fourth arg ($4) is the path to the parent file containing the models to evaluate 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | 21 | # set workdir 22 | CHEMNLP_PATH=/fsx/proj-chemnlp/$1/chemnlp 23 | 24 | # create environment 25 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 26 | 27 | # create experiment config for each model 28 | python $CHEMNLP_PATH/experiments/scripts/eval_create_batch_configs.py $3 $4 29 | 30 | # evaluate each model 31 | for entry in $4/*/ 32 | do 33 | sbatch $CHEMNLP_PATH/experiments/scripts/run_eval.sh $1 $2 "$entry"eval_config.yml 34 | sleep 1 35 | done 36 | -------------------------------------------------------------------------------- /experiments/scripts/run_grid_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | 4 | from chemnlp.data_val.config import GridSearch 5 | from chemnlp.utils import _get_all_combinations 6 | 7 | # User-defined parameters 8 | MULTINODE_RUNS = False 9 | SBATCH_SCRIPT = ( 10 | "experiments/scripts/sbatch_train_hf_multinode.sh" 11 | if MULTINODE_RUNS 12 | else "experiments/scripts/sbatch_train_hf.sh" 13 | ) 14 | 15 | WANDB_GRID_GROUPNAME = "test-grid-search-singlenode" 16 | CONDA_ENV = "experiments/training-env" 17 | CHEMNLP_FOLDER = "jack" 18 | 19 | BASE_CONFIGS = ["1B_fine_tune.yml"] # , "3B_fine_tune.yml"] 20 | GRID_PARAMETERS = GridSearch( 21 | data={"path": ["/fsx/proj-chemnlp/data/EleutherAI/pythia-1b/marianna13/chemrxiv"]}, 22 | 
trainer={"learning_rate": [3e-4, 3e-3], "lr_scheduler_type": ["linear", "cosine"]}, 23 | ) 24 | 25 | if __name__ == "__main__": 26 | # Job submission loop 27 | for config_path in BASE_CONFIGS: 28 | # for each base configuration 29 | config_name = config_path.split(".")[0] 30 | all_possible_hyperparams = _get_all_combinations(GRID_PARAMETERS.dict()) 31 | 32 | for i, overriding_params in enumerate(all_possible_hyperparams): 33 | # set checkpoint dir & wandb run name 34 | run_name = f"{config_name}_{i}" 35 | overriding_params["wandb"]["name"] = run_name 36 | overriding_params["wandb"]["group"] = WANDB_GRID_GROUPNAME 37 | overriding_params["trainer"][ 38 | "output_dir" 39 | ] = f"/fsx/proj-chemnlp/experiments/checkpoints/finetuned/{WANDB_GRID_GROUPNAME}/{run_name}" 40 | # remove spaces for bash 41 | overriding_json = f"'{json.dumps(overriding_params)}'".replace(" ", "") 42 | 43 | # submit every combination of grid search parameters 44 | cmd = f"sbatch {SBATCH_SCRIPT} {CONDA_ENV} {CHEMNLP_FOLDER} {config_path} {overriding_json}" 45 | subprocess.run(cmd, shell=True) 46 | -------------------------------------------------------------------------------- /experiments/scripts/sbatch_train_hf.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llchem-singlenode" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/training_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/training_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs a GPT-NeoX experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the base training config 17 | ### The fourth arg ($4) is an optional json of any overriding configuration values 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | export TOKENIZERS_PARALLELISM=false 21 | export WANDB_BASE_URL="https://stability.wandb.io" 22 | overrides=${4:-'{}'} 23 | 24 | # set workdir 25 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 26 | 27 | # create environment 28 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 29 | 30 | # trigger run 31 | cd $CHEMNLP_PATH 32 | torchrun --standalone --nnodes 1 --nproc-per-node 8 \ 33 | experiments/scripts/run_tune.py experiments/configs/hugging-face/$3 --config_overrides $overrides 34 | -------------------------------------------------------------------------------- /experiments/scripts/sbatch_train_hf_multinode.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | #SBATCH --job-name="llchem-multinode" 3 | #SBATCH --nodes=4 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/training_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/training_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ### This script runs a GPT-NeoX experiments 14 | ### The first arg ($1) is the prefix directory where the environment is saved 15 | ### The second arg ($2) is the directory to use when building the environment 16 | ### The third arg ($3) is the name of the base training config 17 | ### The fourth arg ($4) is an optional json of any overriding configuration values 18 | 19 | set -ex # allow for exiting based on non-0 codes 20 | export TOKENIZERS_PARALLELISM=false 21 | export WANDB_BASE_URL="https://stability.wandb.io" 22 | export NCCL_DEBUG=INFO 23 | export NCCL_ASYNC_ERROR_HANDLING=1 24 | export LOGLEVEL=INFO 25 | overrides=${4:-'{}'} 26 | 27 | # set workdir 28 | CHEMNLP_PATH=/fsx/proj-chemnlp/$2/chemnlp 29 | 30 | # create environment 31 | source $CHEMNLP_PATH/experiments/scripts/env_creation_hf.sh $1 $2 32 | 33 | # Get multinode information 34 | nodes=( $( scontrol show hostnames $SLURM_JOB_NODELIST ) ) 35 | nodes_array=($nodes) 36 | head_node=${nodes_array[0]} 37 | head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) 38 | echo Node IP: $head_node_ip 39 | 40 | # Run script 41 | srun torchrun --nnodes $SLURM_NNODES --nproc_per_node 8 \ 42 | --rdzv_id $RANDOM \ 43 | --rdzv_backend c10d \ 44 | --rdzv_endpoint $head_node_ip:29500 \ 45 | experiments/scripts/run_tune.py experiments/configs/hugging-face/$3 --config_overrides $overrides 46 | -------------------------------------------------------------------------------- /experiments/scripts/transfer_all_checkpoint_to_s3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llchem-transfer-batch" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/transfer_batch_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/transfer_batch_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ## This script recursively copies a directory to S3 storage 14 | ### The first arg ($1) is the full path to a folder (i.e. <....>/1B_experiments/) 15 | ### The second arg ($2) is the S3 bucket to copy to (i.e. 
llchem-models) 16 | ### The third argument is the directory inside proj-chemnlp to find chemnlp 17 | 18 | EC2_AVAIL_ZONE=`curl -s http://169.254.169.254/latest/meta-data/placement/availability-zone` 19 | EC2_REGION="`echo \"$EC2_AVAIL_ZONE\" | sed 's/[a-z]$//'`" 20 | CHEMNLP_PATH=/fsx/proj-chemnlp/$3/chemnlp 21 | CHECKPOINT_DIR=/fsx/proj-chemnlp/experiments/checkpoints 22 | 23 | echo "Finding checkpoints in $1" 24 | all_checkpoints=( $(find $1 -name "checkpoint-*" -type d) ) 25 | 26 | echo "Saving checkpoints to region: $EC2_REGION" 27 | for chkpt in ${all_checkpoints[@]} 28 | do 29 | sbatch $CHEMNLP_PATH/experiments/scripts/transfer_checkpoint_to_s3.sh $chkpt $2 30 | sleep 1 31 | done 32 | -------------------------------------------------------------------------------- /experiments/scripts/transfer_checkpoint_to_s3.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | #SBATCH --job-name="llchem-transfer" 3 | #SBATCH --nodes=1 4 | #SBATCH --ntasks-per-node=1 5 | #SBATCH --cpus-per-task=12 6 | #SBATCH --output=/fsx/proj-chemnlp/experiments/logs/transfer_%j.out 7 | #SBATCH --error=/fsx/proj-chemnlp/experiments/logs/transfer_%j.err 8 | #SBATCH --open-mode=append 9 | #SBATCH --account=topchem 10 | #SBATCH --partition=g40x 11 | #SBATCH --exclusive 12 | 13 | ## This script recursively copies a directory to S3 storage 14 | ### The first arg ($1) is a full path to a checkpoint folder (i.e. <....>/checkpoint-1000) 15 | ### The second arg ($2) is the S3 bucket to copy to (i.e. llchem-models) 16 | 17 | cutat=checkpoints/ 18 | TARGET_DIR=$(echo $1 | awk -F $cutat '{print $2}') # turns /a/b/checkpoints/c/d/ -> c/d/ 19 | PARENT_DIR="$(dirname "$1")" 20 | CHILD_FILE="$(basename "$1")" 21 | 22 | if [ ! -f "$1.tar" ]; then 23 | cd $PARENT_DIR && tar -cvf $CHILD_FILE.tar $CHILD_FILE 24 | fi 25 | 26 | aws s3 cp $1.tar s3://$2/$TARGET_DIR.tar 27 | -------------------------------------------------------------------------------- /experiments/scripts/transfer_hf_cache.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | hf_subdirs=( accelerate datasets hub ) 3 | for sub_dir in "${hf_subdirs[@]}" 4 | do 5 | cp -R ~/.cache/huggingface/$sub_dir/* /fsx/proj-chemnlp/hf_cache/$sub_dir 6 | rm -rf ~/.cache/huggingface/$sub_dir 7 | done 8 | -------------------------------------------------------------------------------- /experiments/working/calculate_nll.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from datasets import load_dataset 3 | from tqdm import tqdm 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | 6 | # See: https://huggingface.co/docs/transformers/perplexity 7 | 8 | MAX_LENGTH = 2048 9 | STRIDE = 512 10 | BASE_STRING = "EleutherAI/pythia-" 11 | PYTHIA_MODELS = ["70m", "160m", "410m", "1b", "1.4b", "2.8b"] 12 | MODELS = [BASE_STRING + x for x in PYTHIA_MODELS] 13 | print(f"Running models: {MODELS}") 14 | 15 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") 16 | 17 | dataset = load_dataset("aeslc")["test"] 18 | print(f"Loaded dataset, size: {len(dataset)}") 19 | 20 | results = {k: None for k in MODELS} 21 | 22 | for model_name in MODELS: 23 | print(f"Starting model: {model_name}") 24 | model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE) 25 | tokenizer = AutoTokenizer.from_pretrained(model_name) 26 | 27 | # Join dataset into one long document: 28 | tokenized_dataset = tokenizer( 29 | 
"\n\n".join(dataset["email_body"]), return_tensors="pt" 30 | ) 31 | seq_len = tokenized_dataset.input_ids.size(1) 32 | prev_end_loc = 0 33 | nlls = [] 34 | 35 | for begin_loc in tqdm(range(0, seq_len, STRIDE)): 36 | end_loc = min(begin_loc + MAX_LENGTH, seq_len) 37 | trg_len = end_loc - prev_end_loc # may be different from stride on last loop 38 | 39 | # Get data: 40 | input_ids = tokenized_dataset.input_ids[:, begin_loc:end_loc].to(DEVICE) 41 | target_ids = input_ids.clone() 42 | target_ids[:, :-trg_len] = -100 # set all but last trg_len tokens to -100 43 | 44 | # Forward pass; 45 | with torch.no_grad(): 46 | outputs = model(input_ids, labels=target_ids) 47 | 48 | # loss is calculated using CrossEntropyLoss which averages over input tokens. 49 | # Multiply it with trg_len to get the summation instead of average. 50 | # We will take average over all the tokens to get the true average 51 | # in the last step of this example. 52 | neg_log_likelihood = outputs.loss * trg_len 53 | 54 | nlls.append(neg_log_likelihood) 55 | 56 | prev_end_loc = end_loc 57 | if end_loc >= seq_len: 58 | break 59 | 60 | results[model_name] = torch.stack(nlls).sum() / end_loc 61 | print(results) 62 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: ChemNLP Documentation 2 | theme: 3 | name: material 4 | nav: 5 | - Home: index.md 6 | - User Guide: 7 | - Installation: user-guide/installation.md 8 | - Quick Start: user-guide/quickstart.md 9 | - API Reference: 10 | - Sampler Module: api/sampler.md 11 | - Sampler CLI: api/sampler_cli.md 12 | - Meta YAML Generator: api/meta_yaml_generator.md 13 | - Meta YAML Augmentor: api/meta_yaml_augmentor.md 14 | - Examples: 15 | - Basic Usage: examples/basic-usage.md 16 | - Advanced Techniques: examples/advanced-techniques.md 17 | - Contributing: CONTRIBUTING.md 18 | - Changelog: changelog.md 19 | markdown_extensions: 20 | - pymdownx.highlight 21 | - pymdownx.superfences 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "setuptools-scm"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "chemnlp" 7 | description = "Open source chemistry dataset & LLM" 8 | readme = "README.md" 9 | requires-python = "==3.9.*" 10 | dependencies = ["pandas", "pydantic", "pydantic_yaml<=0.11.2", "fire", "loguru"] 11 | dynamic = ["version"] 12 | 13 | [project.optional-dependencies] 14 | dev = ["pre-commit", "pytest"] 15 | dataset_creation = [ 16 | "PyTDC", 17 | "rdkit", 18 | "ruamel.yaml", 19 | "selfies", 20 | "deepsmiles", 21 | "pubchempy", 22 | "bioc", 23 | "pylatexenc", 24 | "canonicalize_psmiles@git+https://github.com/Ramprasad-Group/canonicalize_psmiles.git", 25 | "rxn-chem-utils", 26 | "backoff", 27 | "givemeconformer", 28 | "chembl_webresource_client", 29 | "dask", 30 | "pandarallel" 31 | ] 32 | 33 | [project.scripts] 34 | chemnlp-generate-meta = "chemnlp.data.meta_yaml_generator:cli" 35 | chemnlp-augment-meta = "chemnlp.data.meta_yaml_augmenter:cli" 36 | chemnlp-sample = "chemnlp.data.sampler_cli:cli" 37 | chemnlp-add-random-split-column = "chemnlp.data.utils:add_random_split_column_cli" 38 | chemnlp-concatenate-jsonl = "chemnlp.data.utils:concatenate_jsonl_files_cli" 39 | 40 | [tool.setuptools_scm] 41 | version_scheme = "post-release" 42 | 
--------------------------------------------------------------------------------
/src/chemnlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/src/chemnlp/__init__.py
--------------------------------------------------------------------------------
/src/chemnlp/data/hf_datasets.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 |
3 |
4 | def boolq(tokenizer):
5 |     dataset = load_dataset("boolq")
6 |
7 |     def _tokenize_function(example, tokenizer):
8 |         all_text = f"Passage:\n{example['passage']} \nQuestion:\n{example['question']}\nAnswer:\n{example['answer']}"
9 |         return tokenizer(all_text)
10 |
11 |     tokenized = dataset.map(
12 |         _tokenize_function,
13 |         fn_kwargs={"tokenizer": tokenizer},
14 |         remove_columns=["question", "answer", "passage"],
15 |     )
16 |
17 |     return tokenized["train"], tokenized["validation"]
18 |
19 |
20 | def rotten_tomatoes(tokenizer):
21 |     dataset = load_dataset("rotten_tomatoes")
22 |
23 |     def _tokenize_function(example, tokenizer):
24 |         return tokenizer(example["text"])
25 |
26 |     tokenized = dataset.map(
27 |         _tokenize_function,
28 |         fn_kwargs={"tokenizer": tokenizer},
29 |         remove_columns=["text", "label"],
30 |     )
31 |
32 |     return tokenized["train"], tokenized["validation"]
--------------------------------------------------------------------------------
/src/chemnlp/data/meta.yaml:
--------------------------------------------------------------------------------
1 | bibtex:
2 |   - "@article{martins2023,\nauthor = {Martins, John and Doe, Jane and Smith, Alice},\ntitle = {Study on Blood-Brain Barrier Penetration of Various Drugs},\njournal = {Journal of Pharmacology},\nvolume = {12},\nnumber = {3},\npages = {123-134},\nyear = {2023},\ndoi = {10.1234/jpharm.2023.56789}}"
3 | description: Describing the ability of different drugs to penetrate the blood-brain barrier.
4 | identifiers:
5 |   - description: Simplified Molecular Input Line Entry System
6 |     id: SMILES
7 |     type: SMILES
8 |   - description: Name of the compound
9 |     id: compound_name
10 |     names:
11 |       - noun: compound name
12 |     type: Other
13 | license: CC BY 4.0
14 | links:
15 |   - description: corresponding publication
16 |     url: https://example.com/publication
17 |   - description: data source
18 |     url: https://example.com/data_source
19 | name: blood_brain_barrier_martins_et_al
20 | num_points: 2030
21 | targets:
22 |   - description: Indicates whether the compound can penetrate the blood-brain barrier (1 for yes, 0 for no)
23 |     id: penetrate_BBB
24 |     names:
25 |       - noun: blood-brain barrier penetration
26 |     type: integer
27 | templates:
28 |   - The compound {compound_name__names__noun} with SMILES {SMILES#} can {#penetrate|not penetrate!} the blood-brain barrier.
29 |   - The compound {compound_name__names__noun} with SMILES {SMILES#} is in the {split#} set.
30 |   - "Question: Which of the following compounds can penetrate the blood-brain barrier?\nOptions: {%multiple_choice_enum%4%aA1}\n{compound_name%}\nAnswer: {%multiple_choice_result}"
31 |   - The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#}
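The templates above mix plain column placeholders such as {SMILES#} and {penetrate_BBB#} with sampler-specific syntax ({#penetrate|not penetrate!} choices, {%multiple_choice_enum%4%aA1} blocks) that is expanded by chemnlp's sampler module. Purely as a hedged illustration of what a filled-in prompt can look like, the sketch below substitutes only the simple {column#} fields for one made-up data row; it does not reproduce the real sampler logic.

# Made-up example row; real rows come from the tabular dataset this meta.yaml describes.
row = {"SMILES": "CC(Cc1ccc(cc1)C(C(=O)O)C)C", "penetrate_BBB": 1}

# Naive fill of the last template: replace each {column#} placeholder with the row value.
template = "The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#}"
filled = template.replace("{SMILES#}", row["SMILES"]).replace("{penetrate_BBB#}", str(row["penetrate_BBB"]))
print(filled)
# The compound with SMILES CC(Cc1ccc(cc1)C(C(=O)O)C)C can penetrate the blood-brain barrier:1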
--------------------------------------------------------------------------------
/src/chemnlp/data/random_variable.py:
--------------------------------------------------------------------------------
1 | import random
2 | from functools import partial
3 | from typing import Callable, Optional
4 |
5 |
6 | def unwrap_list_length_1(list_input: list):
7 |     """Unwraps a list of length 1 and returns its single element."""
8 |     if isinstance(list_input, list):
9 |         assert len(list_input) == 1
10 |         return list_input[0]
11 |     else:
12 |         raise NotImplementedError()
13 |
14 |
15 | class RandomVariable:
16 |     """Simple random variable class that takes in a name, data, and a sampler.
17 |     The sampler needs to return a list containing a single element."""
18 |
19 |     def __init__(self, name: str, data: list, sampler: Optional[Callable] = None):
20 |         self.name = name
21 |         self.data = data
22 |         self.sampler = partial(random.sample, k=1) if sampler is None else sampler
23 |
24 |     def __repr__(self):
25 |         return f"RandomVariable: {self.name}, {self.data}, {self.sampler}"
26 |
27 |     def __call__(self) -> str:
28 |         """Carries out sampling and returns a single element."""
29 |         return unwrap_list_length_1(self.sampler(self.data))
--------------------------------------------------------------------------------
/src/chemnlp/data/reprs.py:
--------------------------------------------------------------------------------
1 | import backoff
2 | import deepsmiles
3 | import pubchempy as pcp
4 | import requests
5 | import selfies
6 | from rdkit import Chem
7 |
8 |
9 | def smiles_to_selfies(smiles: str) -> str:
10 |     """
11 |     Takes a SMILES and returns the SELFIES encoding.
12 |     """
13 |
14 |     return selfies.encoder(smiles)
15 |
16 |
17 | def smiles_to_deepsmiles(smiles: str) -> str:
18 |     """
19 |     Takes a SMILES and returns the DeepSMILES encoding.
20 |     """
21 |     converter = deepsmiles.Converter(rings=True, branches=True)
22 |     return converter.encode(smiles)
23 |
24 |
25 | def smiles_to_canoncial(smiles: str) -> str:
26 |     """
27 |     Takes a SMILES and returns the canonical SMILES.
28 |     """
29 |     mol = Chem.MolFromSmiles(smiles)
30 |     return Chem.MolToSmiles(mol)
31 |
32 |
33 | def smiles_to_inchi(smiles: str) -> str:
34 |     """
35 |     Takes a SMILES and returns the InChI.
36 |     """
37 |     mol = Chem.MolFromSmiles(smiles)
38 |     return Chem.MolToInchi(mol)
39 |
40 |
41 | def smiles_to_safe(smiles: str) -> str:
42 |     """
43 |     Takes a SMILES and returns the SAFE encoding.
44 |     """
45 |     import safe
46 |
47 |     return safe.encode(smiles, seed=42, canonical=True, randomize=False)
48 |
49 |
50 | CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"
51 |
52 |
53 | @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10)
54 | def cactus_request_w_backoff(smiles, rep="iupac_name"):
55 |     url = CACTUS.format(smiles, rep)
56 |     response = requests.get(url, allow_redirects=True, timeout=10)
57 |     response.raise_for_status()
58 |     name = response.text
59 |     if "html" in name:
60 |         return None
61 |     return name
62 |
63 |
64 | def smiles_to_iupac_name(smiles: str) -> str:
65 |     """Use the chemical name resolver https://cactus.nci.nih.gov/chemical/structure.
66 |     If this does not work, use PubChem.
67 |     """
68 |     try:
69 |         name = cactus_request_w_backoff(smiles, rep="iupac_name")
70 |         if name is None:
71 |             raise Exception
72 |         return name
73 |     except Exception:
74 |         try:
75 |             compound = pcp.get_compounds(smiles, "smiles")
76 |             return compound[0].iupac_name
77 |         except Exception:
78 |             return None
79 |
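A small usage sketch for the converters above; this is illustrative only and assumes chemnlp is installed together with the dataset_creation extras so that rdkit, selfies, and deepsmiles are importable. The exact output strings depend on those libraries, so they are printed rather than asserted; note that smiles_to_canoncial is spelled here exactly as in the module.

from chemnlp.data.reprs import (
    smiles_to_canoncial,
    smiles_to_deepsmiles,
    smiles_to_inchi,
    smiles_to_selfies,
)

# Ibuprofen, the same molecule used in tests/test_reprs.py below.
smiles = "CC(Cc1ccc(cc1)C(C(=O)O)C)C"

# Each helper takes a SMILES string and returns an alternative text representation.
print(smiles_to_canoncial(smiles))
print(smiles_to_selfies(smiles))
print(smiles_to_deepsmiles(smiles))
print(smiles_to_inchi(smiles))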
--------------------------------------------------------------------------------
/src/chemnlp/data_val/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/src/chemnlp/data_val/__init__.py
--------------------------------------------------------------------------------
/src/chemnlp/data_val/validate.py:
--------------------------------------------------------------------------------
1 | import os
2 | from glob import glob
3 |
4 | import fire
5 |
6 | from .model import Dataset
7 |
8 |
9 | def validate_meta(file):
10 |     """Validate a metadata file."""
11 |
12 |     try:
13 |         with open(file, "r") as f:
14 |             _model = Dataset.parse_raw(f.read())  # noqa: F841
15 |     except Exception as e:
16 |         raise ValueError(f"Error parsing {file}: {e}")
17 |
18 |
19 | def validate_folder(folder):
20 |     """Validate all metadata files in a folder."""
21 |
22 |     files = glob(os.path.join(folder, "**", "meta.yaml"))
23 |     for file in files:
24 |         validate_meta(file)
25 |     return True
26 |
27 |
28 | if __name__ == "__main__":
29 |     fire.Fire(validate_folder)
30 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/tests/__init__.py
--------------------------------------------------------------------------------
/tests/data/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenBioML/chemnlp/00d2dd1f1a4e8b23fd4b389b38b2f2ffe5b29ad5/tests/data/__init__.py
--------------------------------------------------------------------------------
/tests/test_ner.py:
--------------------------------------------------------------------------------
1 | from chemnlp.data.ner import group_tokens_by_labels, punctuation_joiner
2 |
3 |
4 | def test_tokens_by_label():
5 |     tokens = ["a", "b", "c", "d", "e", "f"]
6 |
7 |     labels = [0, 1, 1, 0, 1, 0]
8 |     grouped_tokens = group_tokens_by_labels(tokens, labels)
9 |     assert set(grouped_tokens) == set(["b", "c", "e"])
10 |
11 |     labels = [0, 1, 2, 0, 1, 0]
12 |     grouped_tokens = group_tokens_by_labels(tokens, labels)
13 |     assert set(grouped_tokens) == set(["b c", "e"])
14 |
15 |
16 | def test_join_punctuation():
17 |     token_list = [
18 |         "This",
19 |         "is",
20 |         "a",
21 |         "list",
22 |         "of",
23 |         "tokens",
24 |         "with",
25 |         "2",
26 |         ".",
27 |         "5",
28 |         ",",
29 |         "and",
30 |         "3",
31 |         "numbers",
32 |         "intact",
33 |         "semi",
34 |         "-",
35 |         "colon",
36 |         "separated",
37 |         "words",
38 |         "with",
39 |         "decimal",
40 |         "numbers",
41 |         "split",
42 |         "at",
43 |         "dots",
44 |         ".",
45 |         "This",
46 |         "is",
47 |         "a",
48 |         "comma",
49 |         ",",
50 |         "and",
51 |         "a",
52 |         "dot",
53 |         "(",
54 |         "test",
55 |         ")",
56 |         ".",
57 |     ]
58 |     sentence = punctuation_joiner(token_list)
59 |     print(sentence)
60 |     assert (
61 |         sentence
62 |         == "This is a list of tokens with 2.5, and 3 numbers intact semi-colon separated words with decimal numbers split at dots. This is a comma, and a dot (test)."  # noqa
63 |     )
64 |
--------------------------------------------------------------------------------
/tests/test_reprs.py:
--------------------------------------------------------------------------------
1 | from chemnlp.data.reprs import smiles_to_iupac_name
2 |
3 | # not used at the moment
4 | # def test_smiles_to_safe():
5 | #     safe = smiles_to_safe("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
6 | #     # equivalent, only rotations, it is not completely deterministic
7 | #     assert (
8 | #         safe == "c12ccc3cc1.C3(C)C(=O)O.CC(C)C2"
9 | #         or safe == "c13ccc2cc1.C2(C)C(=O)O.CC(C)C3"
10 | #     )
11 |
12 |
13 | def test_smiles_to_iupac_name():
14 |     iupac_name = smiles_to_iupac_name("CC(Cc1ccc(cc1)C(C(=O)O)C)C")
15 |     assert iupac_name == "2-[4-(2-methylpropyl)phenyl]propanoic acid"
16 |
--------------------------------------------------------------------------------
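As a closing usage note (not part of any repository file): the validate_folder helper from src/chemnlp/data_val/validate.py shown above can be pointed at a data directory to check every dataset's meta.yaml in one pass. A minimal sketch, assuming the package is installed and the script is run from the repository root so that the data/tabular layout from the tree at the top of this dump is in place:

from chemnlp.data_val.validate import validate_folder

# Globs <folder>/**/meta.yaml and parses each match into the pydantic Dataset model;
# a ValueError is raised for the first file that fails validation.
validate_folder("data/tabular")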