├── images └── ukbb_srmed.png ├── sh ├── chembl │ └── get_chembl_atc_classification_data_sqlite.sh ├── 04_make_bnf_phewas_phenotypes.sh ├── pheno │ ├── medication_pheno_transpose.sh │ ├── medication_pheno_extract.sh │ ├── normalise_bnf_codes.sh │ ├── cut_pheno_cols.sh │ ├── transpose_pheno_data.sh │ ├── make_ukb_phenotypes_bnf.sh │ ├── make_ukb_phenotypes_bnf_prep.sh │ ├── make_ukb_phenotypes_atc_level3.sh │ ├── make_ukb_phenotypes_atc_level2.sh │ └── make_ukb_phenotypes_atc_prep.sh ├── 04_make_atc_phewas_phenotypes.sh ├── merge_chembl_synonyms_bnf.sh ├── merge_chembl_synonyms_atc.sh ├── merge_chembl_synonyms_ukbb.sh ├── atc_words_synonyms.sh ├── 03_get_ukbb_srmed_data_atc.sh ├── 03_get_ukbb_srmed_data_bnf.sh ├── get_molecule_synonyms_sqlite.sh ├── 01_bnf_prepare_sqlite.sh ├── bnf_words_synonyms.sh ├── preprocess_atc_data.sh ├── preprocess_bnf_data.sh ├── 01_atc_prepare_sqlite.sh ├── 02_bnf_match.sh ├── 02_atc_match.sh ├── bnf_post_process_match_data.sh └── atc_post_process_match_data.sh ├── py ├── bnf_parse.py ├── atc_parse.py ├── pheno │ ├── transpose_pheno_data.py │ ├── normalise_bnf_codes.py │ ├── generate_bnf_medication_annotations.py │ ├── assign_codes_to_participant_data.py │ ├── generate_medication_phenotypes.py │ └── cut_main_csv_file.py ├── list_excl_words.py ├── chembl │ ├── get_atc_data_with_molregno_sqlite.py │ ├── dump_sqlite_table_data.py │ ├── generate_atc_medication_annotations_sqlite.py │ └── generate_atc_medication_annotations.py ├── generate_syn_dictionary.py ├── parse_chembl_synonyms.py ├── append_ukb_counts.py ├── merge_chembl_synonyms.py ├── code_data_match.py └── datahelper.py ├── env └── common_tplt ├── CODE_OF_CONDUCT.md ├── README.md ├── LICENSE └── data └── atc_unmatched_list.csv
/images/ukbb_srmed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PhilAppleby/ukbb-srmed/HEAD/images/ukbb_srmed.png
--------------------------------------------------------------------------------
/sh/chembl/get_chembl_atc_classification_data_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Join atc_classification and molecule_atc_classification 4 | # 5 | python ${PYDIR}/chembl/get_atc_data_with_molregno_sqlite.py > \ 6 | ${CDATADIR}/atc_classification_molregno.tsv
--------------------------------------------------------------------------------
/sh/04_make_bnf_phewas_phenotypes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Generate UKB medication data phenotypes 5 | # 6 | echo "Step 11 - Generate BNF phenotypes" 7 | time ${SHDIR}/pheno/make_ukb_phenotypes_bnf.sh 8 | echo "Step 12 - Generate BNF phenotypes, version 2" 9 | time ${SHDIR}/pheno/normalise_bnf_codes.sh
--------------------------------------------------------------------------------
/sh/pheno/medication_pheno_transpose.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Wrap the pheno/transpose_pheno_data.sh script, supplying parameters 5 | # for medication data 6 | # 7 | echo "Step 9 - Transpose medication phenotype data, eliminating empty cells" 8 | time ${SHDIR}/pheno/transpose_pheno_data.sh reported_medication 20003
--------------------------------------------------------------------------------
/sh/04_make_atc_phewas_phenotypes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Generate UKB medication data phenotypes 5 | # 6 | echo "Step 11a - Generate level2 phenotypes" 7 | time ${SHDIR}/pheno/make_ukb_phenotypes_atc_level2.sh 8 | echo "Step 11b - Generate level3 phenotypes" 9 | time ${SHDIR}/pheno/make_ukb_phenotypes_atc_level3.sh
--------------------------------------------------------------------------------
/sh/merge_chembl_synonyms_bnf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Merge bulk BNF data (ownership uncertain, possibly HIC) with 4 | # synonyms from the CHEMBL database 5 | # 6 | cat ${BNFDATADIR}/bnf_combined.csv | \ 7 | python ${PYDIR}/merge_chembl_synonyms.py --synfile=${CDATADIR}/syn_dict_all.txt > \ 8 | ${BNFDATADIR}/bnf_combined_synonyms.csv
--------------------------------------------------------------------------------
/sh/pheno/medication_pheno_extract.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Wrap the pheno/cut_pheno_cols.sh script, supplying parameters 5 | # for medication data - US only as that's where the main 6 | # UKB csv file resides 7 | # 8 | echo "Step 8 - Extract medication phenotype from the main UKBB file" 9 | time ${SHDIR}/pheno/cut_pheno_cols.sh reported_medication 20003
--------------------------------------------------------------------------------
/sh/merge_chembl_synonyms_atc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Merge (attach) chembl synonyms with atc classification data - both originally 5 | # extracted from the CHEMBL database 6 | # 7 | # 8 | cat ${ATCDATADIR}/atc_who_desc.csv | \ 9 | python ${PYDIR}/merge_chembl_synonyms.py --synfile=${CDATADIR}/syn_dict_all.txt > \ 10 | ${ATCDATADIR}/atc_who_desc_synonyms.csv
--------------------------------------------------------------------------------
/sh/pheno/normalise_bnf_codes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | echo '.1 cat BNF coded data for conversion' ${UKBPDIR} 4 | cat ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv \ 5 | | python ${PYDIR}/pheno/normalise_bnf_codes.py \ 6 | > ${UKBPDIR}/bnf_chapter_section_subsection_counts.csv 7 | #------------------------------------------------------------------------------------------------ 8 | echo 'END'
--------------------------------------------------------------------------------
/sh/merge_chembl_synonyms_ukbb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Merge chembl synonyms with UKBB medication coding data: chembl synonyms 5 | # from the CHEMBL database, UKBB coding data from the UK Biobank data showcase 6 | # 7 | 8 | cat ${UDATADIR}/medication_coding.csv | \ 9 | python ${PYDIR}/merge_chembl_synonyms.py --synfile=${CDATADIR}/syn_dict_all.txt > \ 10 | ${UDATADIR}/medication_coding_synonyms.csv
--------------------------------------------------------------------------------
/sh/atc_words_synonyms.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # CHEMBL synonyms have been attached to both ATC classification data 5 | # and UKB coding data 6 | # This is the main match process - the result file contains code matches 7 | # 8 | # 9 | cat ${UDATADIR}/medication_coding_synonyms.csv | \ 10 | python ${PYDIR}/code_data_match.py --clsfile=${ATCDATADIR}/atc_who_desc_synonyms.csv > \ 11 | ${ATCDATADIR}/results/atc_res.csv
--------------------------------------------------------------------------------
/sh/03_get_ukbb_srmed_data_atc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Extract UKBB SR medication data 4 | # 5 | echo "Step 8 - Extract phenotype information from the UKBB main phenotype file" 6 | time ${SHDIR}/pheno/medication_pheno_extract.sh 7 | echo "Step 9 - Transpose extracted phenotype data" 8 | time ${SHDIR}/pheno/medication_pheno_transpose.sh 9 | echo "Step 10 - Prepare ATC data for phenotype generation" 10 | time ${SHDIR}/pheno/make_ukb_phenotypes_atc_prep.sh
--------------------------------------------------------------------------------
/sh/03_get_ukbb_srmed_data_bnf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Extract UKBB SR medication data 4 | # 5 | echo "Step 8 - Extract phenotype information from the UKBB main phenotype file" 6 | time ${SHDIR}/pheno/medication_pheno_extract.sh 7 | echo "Step 9 - Transpose extracted phenotype data" 8 | time ${SHDIR}/pheno/medication_pheno_transpose.sh 9 | echo "Step 10 - Prepare BNF data for phenotype generation" 10 | time ${SHDIR}/pheno/make_ukb_phenotypes_bnf_prep.sh
--------------------------------------------------------------------------------
/sh/get_molecule_synonyms_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Extract synonyms from the chembl molecule_synonyms table and "flatten" them 4 | # into one record prior to synonym dictionary generation, which expands 5 | # to one record per synonym in the data file output by the pipeline 6 | # 7 | python ${PYDIR}/chembl/dump_sqlite_table_data.py --tablename=molecule_synonyms | \ 8 | sort -k1,1 -n | \ 9 | python ${PYDIR}/parse_chembl_synonyms.py | \ 10 | python ${PYDIR}/generate_syn_dictionary.py > ${CDATADIR}/syn_dict_all.txt
--------------------------------------------------------------------------------
/sh/pheno/cut_pheno_cols.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | #source ${UKBPROJROOT}/env/common 3 | # Only runnable against the single read-only copy of the UKB Phenotype data 4 | # 5 | # must supply arg1, arg2 only (descriptive name and column prefix) 6 | if [[ $# -ne 2 ]] ; then 7 | echo 'must supply arg1, arg2 only (descriptive name and column prefix)' 8 | exit 1 9 | fi 10 | echo ${UKBBPHENODIR}/${UKBBPHENOFILE} 11 | echo "Cut cols: write to ${UKBPDIR} (ukb_${2}_${1}.csv)" 12 | python ${PYDIR}/pheno/cut_main_csv_file.py --csvfile=${UKBBPHENODIR}/${UKBBPHENOFILE} --colprefs=${2} > ${UKBPDIR}/ukb_${2}_${1}.csv
--------------------------------------------------------------------------------
/sh/pheno/transpose_pheno_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | #source ${UKBPROJROOT}/env/common 3 | # Take a multi-column file extracted for a phenotype from 4 | # the main UKB csv file and output one record per participant 5 | # per phenotype code 6 | # 7 | # must supply arg1, arg2 only (descriptive name and column prefix) 8 | if [[ $# -ne 2 ]] ; then 9 | echo 'must supply arg1, arg2 only (descriptive name and column prefix)' 10 | exit 1 11 | fi 12 | echo "Transp: write to ${UKBPDIR}" 13 | 14 | cat ${UKBPDIR}/ukb_${2}_${1}.csv | \ 15 | python ${PYDIR}/pheno/transpose_pheno_data.py > ${UKBPDIR}/ukb_${2}_${1}_n.csv
--------------------------------------------------------------------------------
/sh/01_bnf_prepare_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Data preparation - get BNF classification data and generate synonyms 5 | # 6 | # NOTE: No step 1 here, BNF files must be supplied - this depends 7 | # on local ownership of BNF data 8 | # 9 | echo "Step 2 - Preprocess BNF classification data" 10 | time ${SHDIR}/preprocess_bnf_data.sh 11 | echo "Step 3 - Get and reorganise CHEMBL molecule synonyms" 12 | time ${SHDIR}/get_molecule_synonyms_sqlite.sh 13 | 14 | # Main file artefacts at the end of each step of this group of steps: 15 | 16 | # 2) ${BNFDATADIR}/bnf_combined.csv 17 | # 18 | # 3) ${CDATADIR}/syn_dict_all.txt
--------------------------------------------------------------------------------
/sh/bnf_words_synonyms.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Match words in medication_coding.tsv (the UKBB site has this as coding4.tsv - download link 4 | # was http://biobank.ctsu.ox.ac.uk/showcase/coding.cgi?id=4 as at 20171019) 5 | # 6 | # bnf_combined.csv is a mashup of words from both the chemical names file and from HIC's bnf file 7 | # CAN WE USE THIS? 8 | # Advantages are that it has proprietary names and chemical names 9 | # 10 | cat ${UDATADIR}/medication_coding_synonyms.csv | python ${PYDIR}/code_data_match.py \ 11 | --clsfile=${BNFDATADIR}/bnf_combined_synonyms.csv \ 12 | --multioutput=N \ 13 | > ${BNFDATADIR}/results/bnf_res.csv
--------------------------------------------------------------------------------
/sh/preprocess_atc_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Preprocess the ATC classification data to cut the relevant columns 5 | # Executed twice to capture both molregno and description 6 | # Then to trim the ATC code to the required length (level3) 7 | # 8 | awk -F '\t' '{print $2 "\t" $3}' ${CDATADIR}/atc_classification_molregno.tsv | \ 9 | grep -v None > ${CDATADIR}/atc_who_desc_fullcode.tsv 10 | awk -F '\t' '{print $2 "\t" $1}' ${CDATADIR}/atc_classification_molregno.tsv >> \ 11 | ${CDATADIR}/atc_who_desc_fullcode.tsv 12 | cat ${CDATADIR}/atc_who_desc_fullcode.tsv | \ 13 | python ${PYDIR}/atc_parse.py --codelen=4 > ${ATCDATADIR}/atc_who_desc.csv
--------------------------------------------------------------------------------
/sh/preprocess_bnf_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Preprocess BNF files to cut the relevant columns, normalise 4 | # coding and convert all text to lower case 5 | # 6 | # Local data Code + long description 7 | cut -f 1,3 -d ',' ${BNFDATADIR}/bnf_data_formatted.csv > ${BNFDATADIR}/bnf_data_1.csv 8 | # Local data Code + approved_name 9 | cut -f 1,4 -d ',' ${BNFDATADIR}/bnf_data_formatted.csv | sed '1d' > ${BNFDATADIR}/bnf_data_2.csv 10 | 11 | # Cat the 2 result files from above, format the bnf code and convert all text to lower case 12 | cat ${BNFDATADIR}/bnf_data_1.csv ${BNFDATADIR}/bnf_data_2.csv | python ${PYDIR}/bnf_parse.py > ${BNFDATADIR}/bnf_combined.csv
--------------------------------------------------------------------------------
/py/bnf_parse.py:
--------------------------------------------------------------------------------
1 | # 2 | # Transform BNF code to formatted version 3 | # 4 | import time 5 | import datetime 6 | # import re 7 | import os, sys 8 | from datahelper import Datahelper 9 | 10 | def main(): 11 | count = 0 12 | dh = Datahelper() 13 | hdr = sys.stdin.readline().strip() 14 | print(hdr) 15 | 16 | for line in sys.stdin: 17 | count += 1 18 | data = line.strip().split(',') 19 | data[0] = dh.format_digit_code(data[0], 3) 20 | data[1] = data[1].lower() 21 | print(','.join(data)) 22 | 23 | return count 24 | 25 | # execution flow starts here 26 | # 27 | start_time = time.time() 28 | 29 | count = main() 30 | #print "END:", time.time() - start_time, "seconds", count 31 |
--------------------------------------------------------------------------------
/sh/01_atc_prepare_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Data preparation - get ATC classification data and generate synonyms 5 | # 6 | echo "Step 1 - Get ATC classification data from the CHEMBL Db" 7 | time ${SHDIR}/chembl/get_chembl_atc_classification_data_sqlite.sh 8 | echo "Step 2 - Preprocess ATC classification data" 9 | time ${SHDIR}/preprocess_atc_data.sh 10 | echo "Step 3 - Get and reorganise CHEMBL molecule synonyms" 11 | time ${SHDIR}/get_molecule_synonyms_sqlite.sh 12 | 13 | # Main file artefacts at the end of each step of this group of steps: 14 | 15 | # 1) ${CDATADIR}/atc_classification_molregno.tsv 16 | # 17 | # 2) ${ATCDATADIR}/atc_who_desc.csv 18 | # 19 | # 3) ${CDATADIR}/syn_dict_all.txt
--------------------------------------------------------------------------------
/py/atc_parse.py:
--------------------------------------------------------------------------------
1 | # 2 | # Trim ATC code to formatted version 3 | # Calls a function in the datahelper class 4 | # 5 | import time 6 | import datetime 7 | import os, sys 8 | from datahelper import Datahelper 9 | from optparse import OptionParser 10 | 11 | def main(options): 12 | dh = Datahelper() 13 | 14 | for line in sys.stdin: 15 | data = line.strip().split('\t') 16 | data[0] = dh.format_atc_code(data[0], int(options.codelen)) 17 | data[1] = dh.get_normalised_phrase(data[1]).lower() 18 | print(','.join(data)) 19 | # End main() 20 | 21 | # execution flow starts here 22 | # 23 | start_time = time.time() 24 | 25 | parser = OptionParser() 26 | parser.add_option("-l", "--codelen", dest="codelen", 27 | help="ATC code length", metavar="INT") 28 | (options, args) = parser.parse_args() 29 | 30 | main(options) 31 |
--------------------------------------------------------------------------------
/py/pheno/transpose_pheno_data.py:
--------------------------------------------------------------------------------
1 | # 2 | # print records for all participants for all codes found in the input (eliminates blank records) 3 | # 4 | # NOTE: Assumes relevant data starts in the second column (1) - (eid) is the first column (0).
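# Example (hypothetical eid and codes): the input row "1000001,123456,,654321" yields
# two output rows, "1000001,123456" and "1000001,654321".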
5 | # 6 | import time 7 | import datetime 8 | import re 9 | import os, sys 10 | import random 11 | import json 12 | 13 | def main(): 14 | 15 | codes = [] 16 | count = 0 17 | hdr = sys.stdin.readline() 18 | print("eid,code") 19 | 20 | for line in sys.stdin: 21 | data = line.strip().split(',') 22 | for elem in data[1:]: 23 | if elem != "": 24 | print("{0},{1}".format(data[0], elem)) 25 | 26 | return count 27 | 28 | 29 | # execution flow starts here 30 | # 31 | start_time = time.time() 32 | 33 | count = main() 34 | #print "END:", time.time() - start_time, "seconds", count 35 | 36 |
--------------------------------------------------------------------------------
/env/common_tplt:
--------------------------------------------------------------------------------
1 | export UKBBDATADIR= 2 | export ATCDATADIR=${PROJDATA}/atc 3 | export BNFDATADIR=${PROJDATA}/bnf 4 | export BNFCODEDIR=${PROJDATA}/GoDARTS2/lookups 5 | export UKBPDIR=${PROJDATA}/ukb_pheno 6 | export UKBBPHENOFILE= 7 | export UDATADIR=${PROJDATA}/ukb 8 | export CDATADIR=${PROJDATA}/chembl 9 | export PYDIR=${UKBPROJROOT}/py 10 | export PYTHONPATH=${PYDIR}:${PYTHONPATH} 11 | export SHDIR=${UKBPROJROOT}/sh 12 | export LOCBINDIR=${HOME}/local/bin 13 | # Note that this is for sqlite3 access to ChEMBL data 14 | export CHFILE=${PROJDATA}/chembl_23/chembl_23.db 15 | # Fill in the following if using MySQL (other DBMS's may require different parameters) 16 | export CHHOST= 17 | export CHPORT= 18 | export CHDB=chembldb23 19 | export CHUSER= 20 | export CHPWD=
--------------------------------------------------------------------------------
/py/pheno/normalise_bnf_codes.py:
--------------------------------------------------------------------------------
1 | # Normalise dotted BNF codes (chapter.section.subsection) from participant rows into zero-padded concatenated codes 2 | # A fixed count of 10 is emitted for each row 3 | import time 4 | import datetime 5 | import re 6 | import os, sys 7 | 8 | start_time = time.time() 9 | 10 | def main(): 11 | 12 | print("id,icd9,count") 13 | bnf_col = 2 14 | 15 | for line in sys.stdin: 16 | data = line.strip().split(",") 17 | bnf_code = "" 18 | bnf_array = data[bnf_col].split(".") 19 | if bnf_array[0] == "DU": 20 | continue 21 | if bnf_array[0] == "NULL": 22 | continue 23 | if len(bnf_array) == 1: 24 | bnf_code = "{0:02d}".format(int(bnf_array[0])) 25 | elif len(bnf_array) == 2: 26 | bnf_code = "{0:02d}{1:02d}".format(int(bnf_array[0]), int(bnf_array[1])) 27 | elif len(bnf_array) == 3: 28 | bnf_code = "{0:02d}{1:02d}{2:02d}".format(int(bnf_array[0]), int(bnf_array[1]), int(bnf_array[2])) 29 | print("{},{},{}".format(data[0], bnf_code, 10)) 30 | return 31 | 32 | 33 | # execution flow starts here 34 | # 35 | main() 36 | #print "END:", time.time() - start_time, "seconds", rec_count 37 | 38 |
--------------------------------------------------------------------------------
/py/list_excl_words.py:
--------------------------------------------------------------------------------
1 | # List the Datahelper's excluded words 2 | import time 3 | import datetime 4 | import re 5 | import string 6 | import os, sys 7 | from optparse import OptionParser 8 | from datahelper import Datahelper 9 | 10 | def main(): 11 | 12 | try: 13 | dh = Datahelper() 14 | except IOError as e: 15 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 16 | print("I/O error:", sys.exc_info()) 17 | exit() 18 | except TypeError as e: 19 | print("Missing arguments ", e) 20 | exit() 21 | except: 22 | #print "Unexpected error:", sys.exc_info()[0] 23 | print("Unexpected error:", sys.exc_info()) 24 | exit() 25 | 26 | ewords = dh.get_excluded_words() 27 | 28 | count = 0 29 |
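# Print the excluded words four to a comma-separated line, flushing any remainder after the loop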
outarray = [] 30 | for wd in ewords: 31 | count += 1 32 | outarray.append(wd) 33 | if count % 4 == 0: 34 | print(",".join(outarray)) 35 | outarray = [] 36 | if len(outarray) > 0: 37 | print(",".join(outarray)) 38 | #print count 39 | 40 | return 41 | 42 | 43 | # execution flow starts here 44 | # 45 | main() 46 | #print "END:", time.time() - start_time, "seconds", count, match_count, miss_count 47 | 48 | -------------------------------------------------------------------------------- /sh/02_bnf_match.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # The Match steps: 4 | # BNF and UKBB data synonym merging, 5 | # 6 | # All BNF codes and descriptions: 7 | # UKBB side - has synonyms 8 | # BNF side - has synonyms 9 | # 10 | # 11 | echo "Step 4 - Merge CHEMBL molecule synonyms and BNF classification data" 12 | time ${SHDIR}/merge_chembl_synonyms_bnf.sh 13 | echo "Step 5 - Merge CHEMBL molecule synonyms and UKB self-reported medication data" 14 | time ${SHDIR}/merge_chembl_synonyms_ukbb.sh 15 | echo "Step 6 - The main matching step - attempt to assign BNF codes to UKBB data" 16 | time ${SHDIR}/bnf_words_synonyms.sh 17 | echo "Step 7 - Post process match / mismatch data to format for manual intervention and phenotype generation" 18 | time ${SHDIR}/bnf_post_process_match_data.sh 19 | 20 | # Main output files at the end of each step: 21 | 22 | # 4) ${BNFDATADIR}/bnf_combined_synonyms.csv 23 | 24 | # 5) ${UDATADIR}/medication_coding_synonyms.csv 25 | 26 | # 6) ${BNFDATADIR}/results/bnf_res.csv 27 | 28 | # 7) ${BNFDATADIR}/results/bnf_matched.csv and ${BNFDATADIR}/results/bnf_missing.csv (which is then used to assign manual matches) 29 | -------------------------------------------------------------------------------- /sh/02_atc_match.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # The Match steps: 4 | # ATC and UKBB data synonym merging, 5 | # 6 | # All ATC codes and descriptions: 7 | # UKBB side - has synonyms 8 | # ATC side - has synonyms 9 | # 10 | # 11 | echo "Step 4 - Merge CHEMBL molecule synonyms and ATC classification data" 12 | time ${SHDIR}/merge_chembl_synonyms_atc.sh 13 | echo "Step 5 - Merge CHEMBL molecule synonyms and UKB self-reported medication data" 14 | time ${SHDIR}/merge_chembl_synonyms_ukbb.sh 15 | echo "Step 6 - The main matching step - attempt to assign ATC level3 codes to UKBB data" 16 | time ${SHDIR}/atc_words_synonyms.sh 17 | echo "Step 7 - Post process match / mismatch data to format for manual intervention and phenotype generation" 18 | time ${SHDIR}/atc_post_process_match_data.sh 19 | 20 | # Main output files at the end of each step: 21 | 22 | # 4) ${ATCDATADIR}/atc_who_desc_synonyms.csv 23 | 24 | # 5) ${UDATADIR}/medication_coding_synonyms.csv 25 | 26 | # 6) ${ATCDATADIR}/results/atc_res.csv 27 | 28 | # 7) ${ATCDATADIR}/results/atc_matched.csv and ${ATCDATADIR}/results/atc_missing.csv (which is then used to assign manual matches) 29 | -------------------------------------------------------------------------------- /py/chembl/get_atc_data_with_molregno_sqlite.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sqlite3 3 | import os, sys 4 | from optparse import OptionParser 5 | # 6 | # Access the publicly available CHEMBL database (local copy) to 7 | # obtain ATC (WHO) classification data 8 | # 9 | # 10 | def main(): 11 | """ 12 | 
Get atc data for use in classifying UKBB coded medication data 13 | Requires a join of the CHEMBL atc_classification and molecule_atc_classification tables 14 | """ 15 | try: 16 | chembl = sqlite3.connect(os.environ["CHFILE"]) 17 | chembl.text_factory = str 18 | except: 19 | print("Unexpected error:", sys.exc_info()) 20 | exit() 21 | 22 | query = """select atc.who_name, atc.level5, matc.molregno, atc.level4_description 23 | from atc_classification atc 24 | LEFT JOIN molecule_atc_classification matc 25 | ON atc.level5 = matc.level5""" 26 | 27 | cursor = chembl.cursor() 28 | cursor.execute(query) 29 | for row in cursor: 30 | print('\t'.join([str(elem) for elem in row])) 31 | 32 | chembl.close() 33 | # main() ends 34 | 35 | 36 | # execution flow starts here 37 | # 38 | start_time = time.time() 39 | 40 | main()
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_bnf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | echo '.1 sort BNF coded data to' ${UKBPDIR} 4 | cat ${UKBPDIR}/ukb_20003_with_bnf_codes_matched.csv | \ 5 | sort -u > ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv 6 | 7 | # step 2 get the list of possible phenotypes 8 | echo '.2 extract list of possible phenotypes to' ${UKBPDIR} 9 | cut -f 3 -d ',' ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv | \ 10 | sort -u > ${UKBPDIR}/ukb_possible_med_phenotypes_bnf.csv 11 | 12 | # step 3 generate phenotype annotations, get data from the BNF description file 13 | echo '.3 Get data from bnf desc file, write to' ${UKBPDIR} 14 | cat ${UKBPDIR}/ukb_possible_med_phenotypes_bnf.csv | \ 15 | python ${PYDIR}/pheno/generate_bnf_medication_annotations.py \ 16 | --bnfcodes=${BNFCODEDIR}/bnf_listing.txt > \ 17 | ${UKBPDIR}/Anno_medications_BIN_bnf.csv 18 | 19 | # step 4 generate phenotypes 20 | echo '.4 write PheWAS phenos to' ${UKBPDIR} 21 | cat ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv | \ 22 | python ${PYDIR}/pheno/generate_medication_phenotypes.py \ 23 | --pfile=${UKBPDIR}/Anno_medications_BIN_bnf.csv > \ 24 | ${UKBPDIR}/med_phenotypes_bnf.tsv 25 | #------------------------------------------------------------------------------------------------ 26 | echo 'END'
--------------------------------------------------------------------------------
/py/pheno/generate_bnf_medication_annotations.py:
--------------------------------------------------------------------------------
1 | import time 2 | import os, sys 3 | from datahelper import Datahelper 4 | from optparse import OptionParser 5 | 6 | def load_code_list(fh): 7 | lookup = {} 8 | for line in fh: 9 | codedata = line.strip().split("|") 10 | lookup[codedata[0]] = codedata[1] 11 | 12 | return lookup 13 | 14 | def main(options): 15 | """ 16 | Generate one PheWAS annotation record (pheno,PHENOTYPE,Category,type) per candidate BNF code read from stdin. 17 | """ 18 | dh = Datahelper() 19 | try: 20 | fh = open(options.bnfcodes, "r") 21 | code_lookup = load_code_list(fh) 22 | except: 23 | print("Unexpected error:", sys.exc_info()) 24 | exit() 25 | 26 | print("pheno,PHENOTYPE,Category,type") 27 | 28 | count = 0 29 | for line in sys.stdin: 30 | data = line.strip().split(',') 31 | 32 | 33 | if data[0] in code_lookup: 34 | count += 1 35 | pheno_string = dh.get_normalised_phrase(code_lookup[data[0]]) 36 | pheno_string = dh.make_pheno_string(pheno_string) 37 | #data.append(data[0] + "_" + pheno_string) 38 | data.append(data[0]) 39 | data.append(pheno_string) 40 | data.append("BINARY") 41 | print(','.join(data)) 42 | 43 | return count 44 | 45 | # execution flow starts here 46 | # 47 | start_time = time.time() 48 | parser = OptionParser() 49 | parser.add_option("-b", "--bnfcodes", dest="bnfcodes", 50 | help="bnfcodes", metavar="FILE") 51 | 52 | (options, args) = parser.parse_args() 53 | 54 | rcount = main(options) 55 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_bnf_prep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Just get the matches from the manually edited file 5 | #egrep -vw "UKBB_code|NA" ${BNFDATADIR}/results/bnf_all_manual_matches.csv > \ 6 | # ${BNFDATADIR}/results/bnf_manual_matches.csv 7 | # Cut relevant columns 8 | #cut -f 1,2,5 -d ',' ${BNFDATADIR}/results/bnf_manual_matches.csv > \ 9 | # ${BNFDATADIR}/results/bnf_manual_matches_cut.csv 10 | #cut -f 1,2,6 -d ',' ${BNFDATADIR}/results/bnf_matched.csv > \ 11 | # ${BNFDATADIR}/results/bnf_all_matches.csv 12 | 13 | # Combine auto and manually matched codes 14 | #cat ${BNFDATADIR}/results/bnf_auto_matches_cut.csv \ 15 | # ${BNFDATADIR}/results/bnf_manual_matches_cut.csv > \ 16 | # ${BNFDATADIR}/results/bnf_all_matches.csv 17 | 18 | # Assign codes where possible to all items in the reported medication list 19 | cat ${UKBPDIR}/ukb_20003_reported_medication_n.csv | \ 20 | python ${PYDIR}/pheno/assign_codes_to_participant_data.py \ 21 | --codefile=${BNFDATADIR}/results/bnf_all_matches.csv > \ 22 | ${UKBPDIR}/ukb_20003_with_bnf_codes.csv 23 | 24 | # Get list of unmatched participant medication data 25 | grep -w NA ${UKBPDIR}/ukb_20003_with_bnf_codes.csv > \ 26 | ${UKBPDIR}/ukb_20003_with_bnf_codes_unmatched.csv 27 | 28 | # Get list of matched participant medication data (without a header); this 29 | # will feed into phenotype generation 30 | grep -wv NA ${UKBPDIR}/ukb_20003_with_bnf_codes.csv | sed '1,1d' > \ 31 | ${UKBPDIR}/ukb_20003_with_bnf_codes_matched.csv 32 |
--------------------------------------------------------------------------------
/py/generate_syn_dictionary.py:
--------------------------------------------------------------------------------
1 | # 2 | # Third step in the chembl synonym generation pipeline: 3 | # python ${PYDIR}/chembl/dump_sqlite_table_data.py --tablename=molecule_synonyms | \ 4 | # sort -k1,1 -n | \ 5 | # python ${PYDIR}/parse_chembl_synonyms.py | \ 6 | # python ${PYDIR}/generate_syn_dictionary.py > ${CDATADIR}/syn_dict_all.txt 7 | # 8 | import time 9 | import datetime 10 | import re 11 | import os, sys 12 | import random 13 | import json 14 | 15 | def main(): 16 | """ 17 | For each synonym set of size n (record) in the input: generate n records in 18 | which each synonym is the key.
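For example (hypothetical record): an input line TRADE_NAME:somedrug|INN:some chemical|MOLREGNO:99999 yields one output record per synonym, each keyed on that synonym and carrying the full '|'-separated synonym set.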
19 | 20 | But special cases need handling: 21 | Achieved by building an internal dictionary which allows for synonyms appearing in more 22 | than 1 input record (In CHEMBL terms allows for > 1 molregno having the same synonym in its 23 | synonym set) 24 | """ 25 | count = 0 26 | synonyms = {} 27 | 28 | for line in sys.stdin: 29 | data = line.strip().split(',') 30 | type_syns = data[0].split('|') 31 | syns = [x.split(':')[1].strip() for x in type_syns] 32 | for syn in syns: 33 | if syn not in synonyms: 34 | synonyms[syn] = [] 35 | for other_syn in syns: 36 | if other_syn not in synonyms[syn]: 37 | synonyms[syn].append(other_syn) 38 | 39 | for syn in sorted(synonyms): 40 | print("{0}\t{1}".format(syn, '|'.join(set(synonyms[syn])))) 41 | count += 1 42 | return count 43 | 44 | # execution flow starts here 45 | # 46 | start_time = time.time() 47 | count = main() 48 | 49 |
--------------------------------------------------------------------------------
/py/parse_chembl_synonyms.py:
--------------------------------------------------------------------------------
1 | # 2 | # Second step in the chembl synonym generation pipeline: 3 | # python ${PYDIR}/chembl/dump_sqlite_table_data.py --tablename=molecule_synonyms | \ 4 | # sort -k1,1 -n | \ 5 | # python ${PYDIR}/parse_chembl_synonyms.py | \ 6 | # python ${PYDIR}/generate_syn_dictionary.py > ${CDATADIR}/syn_dict_all.txt 7 | # 8 | # 9 | import time 10 | import datetime 11 | import re 12 | import os, sys 13 | import random 14 | import json 15 | from datahelper import Datahelper 16 | 17 | def main(): 18 | """ 19 | Requires the input to be sorted on the first field (the numeric molregno). 20 | One record per molregno is output. 21 | 22 | Calls a datahelper function to normalise each word or phrase (convert to 23 | lower case and remove special characters) 24 | """ 25 | count = 0 26 | last_molno = "" 27 | related_synonyms = [] 28 | dh = Datahelper() 29 | 30 | for line in sys.stdin: 31 | data = line.strip().split('\t') 32 | if data[0] != last_molno and last_molno != "": 33 | print('|'.join(related_synonyms) + "|MOLREGNO:" + last_molno) 34 | related_synonyms = [] 35 | last_molno = data[0] 36 | text = dh.get_normalised_phrase(data[1]) 37 | stype = data[2] 38 | syn = stype + ":" + text 39 | if syn not in related_synonyms: 40 | related_synonyms.append(syn) 41 | 42 | # output the last synonym group 43 | print('|'.join(related_synonyms) + "|MOLREGNO:" + last_molno) 44 | 45 | # execution flow starts here 46 | # 47 | start_time = time.time() 48 | 49 | count = main() 50 | 51 |
--------------------------------------------------------------------------------
/py/append_ukb_counts.py:
--------------------------------------------------------------------------------
1 | # 2 | # Append UKBB self-report counts to a file 3 | # with bnf description as col 2 4 | # 5 | import time 6 | import datetime 7 | import re 8 | import os, sys 9 | import random 10 | import json 11 | from optparse import OptionParser 12 | 13 | def load_count_data(fh): 14 | ukbb_counts = {} 15 | 16 | for line in fh: 17 | data = line.strip().split(',') 18 | ukbb_counts[data[0]] = data[1] 19 | 20 | return ukbb_counts 21 | 22 | def main(options): 23 | count = 0 24 | last_molno = "" 25 | related_synonyms = [] 26 | 27 | # try to load the UK counts file 28 | try: 29 | fh = open(options.ukbcfile, "r") 30 | ukbb_counts = load_count_data(fh) 31 | except IOError as e: 32 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 33 | print("I/O error:", sys.exc_info()) 34 | exit() 35 | except TypeError as e: 36 | print("Missing arguments ", e) 37 | exit() 38 | except: 39 | #print "Unexpected error:", sys.exc_info()[0] 40 | print("Unexpected error:", sys.exc_info()) 41 | exit() 42 | 43 | for line in sys.stdin: 44 | data = line.strip().split(',') 45 | count = '0' 46 | if data[1] in ukbb_counts: 47 | count = ukbb_counts[data[1]] 48 | data.append(count) 49 | print(','.join(data)) 50 | 51 | # execution flow starts here 52 | # 53 | start_time = time.time() 54 | parser = OptionParser() 55 | # 56 | parser.add_option("-u", "--ukbcfile", dest="ukbcfile", 57 | help="UKB count file", metavar="FILE") 58 | 59 | (options, args) = parser.parse_args() 60 | 61 | count = main(options) 62 | #print "END:", time.time() - start_time, "seconds", count 63 | 64 |
--------------------------------------------------------------------------------
/py/chembl/dump_sqlite_table_data.py:
--------------------------------------------------------------------------------
1 | import time 2 | import sqlite3 3 | import os, sys 4 | from optparse import OptionParser 5 | 6 | def main(options): 7 | """ 8 | Dump out the data for any sqlite table in the chembl db as a tab-separated values file 9 | Accepts an optional (simple) where clause and an optional row limit. 10 | """ 11 | try: 12 | chembl = sqlite3.connect(os.environ["CHFILE"]) 13 | chembl.text_factory = str 14 | except: 15 | #print "Unexpected error:", sys.exc_info()[0] 16 | print("Unexpected error:", sys.exc_info()) 17 | exit() 18 | 19 | query = "select * from {0}".format(options.tablename) 20 | 21 | if options.where_clause != None: 22 | query += " {0}".format(options.where_clause) 23 | 24 | if options.limit != None: 25 | query += " limit {0}".format(options.limit) 26 | 27 | count = 0 28 | 29 | cursor = chembl.cursor() 30 | cursor.execute(query) 31 | for row in cursor: 32 | count += 1 33 | print('\t'.join([str(elem) for elem in row])) 34 | 35 | chembl.close() 36 | 37 | return count 38 | 39 | # execution flow starts here 40 | # 41 | parser = OptionParser() 42 | 43 | parser.add_option("-t", "--tablename", dest="tablename", 44 | help="Table name", metavar="STR") 45 | 46 | parser.add_option("-w", "--where_clause", dest="where_clause", 47 | help="Optional where clause", metavar="STR") 48 | 49 | parser.add_option("-l", "--limit", dest="limit", 50 | help="Optional row limit (suggest 1 at test-time)", metavar="STR") 51 | 52 | start_time = time.time() 53 | (options, args) = parser.parse_args() 54 | 55 | rcount = main(options) 56 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/bnf_post_process_match_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Post process the BNF results files 4 | # 5 | egrep -w "NA" ${BNFDATADIR}/results/bnf_res.csv > ${BNFDATADIR}/results/bnf_res_NA.csv 6 | egrep -vw "AMB|NA" ${BNFDATADIR}/results/bnf_res.csv > ${BNFDATADIR}/results/bnf_res_NONA.csv 7 | # add UKBB counts to matches and unmatches 8 | cat ${BNFDATADIR}/results/bnf_res_NA.csv | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > ${BNFDATADIR}/results/bnf_missing.csv 9 | cat ${BNFDATADIR}/results/bnf_res_NONA.csv | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > ${BNFDATADIR}/results/bnf_matched.csv 10 | # one_word matches are the most risky 11 | grep ":1," ${BNFDATADIR}/results/bnf_matched.csv > ${BNFDATADIR}/results/bnf_one_word_match_full.csv 12 | cut -f 1,2,3,5,6 -d ',' ${BNFDATADIR}/results/bnf_one_word_match_full.csv > ${BNFDATADIR}/results/one_word_match_list.csv 13 | 14 | # all matched data 15 | cut -f 1,2,6 -d ',' ${BNFDATADIR}/results/bnf_matched.csv > ${BNFDATADIR}/results/bnf_matched_list.csv 16 | sort ${BNFDATADIR}/results/bnf_matched_list.csv > ${BNFDATADIR}/results/bnf_matched_list_sorted.csv 17 | # looking at missing data 18 | cut -f 1,2,8 -d ',' ${BNFDATADIR}/results/bnf_missing.csv > ${BNFDATADIR}/results/bnf_unmatched_list.csv 19 | sort ${BNFDATADIR}/results/bnf_unmatched_list.csv > ${BNFDATADIR}/results/bnf_unmatched_list_sorted.csv 20 | # unique matched UKBB ids 21 | cut -f 1,2 -d ',' ${BNFDATADIR}/results/bnf_matched.csv | sort -u > ${BNFDATADIR}/results/bnf_matched_unique.csv 22 | cut -f 1,2 -d ',' ${BNFDATADIR}/results/bnf_matched.csv | sort | uniq -c | sort -nr > ${BNFDATADIR}/results/bnf_match_counts.csv 23 | 24 |
--------------------------------------------------------------------------------
/py/chembl/generate_atc_medication_annotations_sqlite.py:
--------------------------------------------------------------------------------
1 | import time 2 | import sqlite3 3 | import os, sys 4 | from datahelper import Datahelper 5 | from optparse import OptionParser 6 | 7 | def main(options): 8 | """ 9 | Access the CHEMBL db for each input line and use the description 10 | from the appropriate level 11 | """ 12 | 13 | level = int(options.level) 14 | 15 | dh = Datahelper() 16 | try: 17 | chembl = sqlite3.connect(os.environ["CHFILE"]) 18 | chembl.text_factory = str 19 | except: 20 | #print "Unexpected error:", sys.exc_info()[0] 21 | print("Unexpected error:", sys.exc_info()) 22 | exit() 23 | 24 | print("pheno,PHENOTYPE,Category,type") 25 | 26 | count = 0 27 | for line in sys.stdin: 28 | data = line.strip().split(',') 29 | atc_code = data[0] 30 | query = "select level{0}_description from atc_classification where level{1} = '{2}' limit 1".format(level, level, atc_code) 31 | 32 | 33 | cursor = chembl.cursor() 34 | cursor.execute(query) 35 | for row in cursor: 36 | count += 1 37 | pheno_string = dh.get_normalised_phrase(row[0]) 38 | pheno_string = dh.make_pheno_string(pheno_string) 39 | data.append(data[0] + "_" + pheno_string) 40 | data.append(pheno_string) 41 | data.append("BINARY") 42 | print(','.join(data)) 43 | 44 | chembl.close() 45 | return count 46 | 47 | # execution flow starts here 48 | # 49 | start_time = time.time() 50 | parser = OptionParser() 51 | parser.add_option("-l", "--level", dest="level", 52 | help="ATC level", metavar="INT") 53 | (options, args) = parser.parse_args() 54 | 55 | rcount = main(options) 56 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_atc_level3.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # For level 3 4 | # step 1 sort to eliminate duplicates - at this stage we have level3 codes, one row per 5 | # participant per assigned ATC code 6 | echo '.1 Write level3 codes to' ${UKBPDIR} 7 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched.csv | \ 8 | python ${PYDIR}/pheno/trim_atc_code.py --codelen=4 | \ 9 | sort -u > ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level3.csv 10 | 11 | # step 2 get the list of possible phenotypes 12 | echo '.2 extract list of possible phenotypes to' ${UKBPDIR} 13 | cut -f 3 -d ',' ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level3.csv | \ 14 | sort -u > ${UKBPDIR}/ukb_possible_med_phenotypes_level3.csv 15 | 16 | # step 3 generate phenotype annotations 17 | echo '.3 Get data from sqlite db, write to' ${UKBPDIR} 18 | cat ${UKBPDIR}/ukb_possible_med_phenotypes_level3.csv | \ 19 | python ${PYDIR}/chembl/generate_atc_medication_annotations_sqlite.py --level=3 > \ 20 | ${UKBPDIR}/Anno_medications_BIN_atc_level3.csv 21 | 22 | #echo '.3 Get data from mysql db, write to ${UKBPDIR}' 23 | #cat ${UKBPDIR}/ukb_possible_med_phenotypes_level3.csv | \ 24 | # python ${PYDIR}/chembl/generate_atc_medication_annotations.py --level=3 > \ 25 | # ${UKBPDIR}/Anno_medications_BIN_atc_level3.csv 26 | 27 | # step 4 generate phenotypes 28 | echo '.4 write PheWAS phenos to' ${UKBPDIR} 29 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level3.csv | \ 30 | python ${PYDIR}/pheno/generate_medication_phenotypes.py \ 31 | --pfile=${UKBPDIR}/Anno_medications_BIN_atc_level3.csv > \ 32 | ${UKBPDIR}/med_phenotypes_level3.tsv 33 | #------------------------------------------------------------------------------------------------ 34 | echo 'END' 35 |
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_atc_level2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # For level 2 4 | # step 1 sort to eliminate duplicates - at this stage we have level2 codes, one row per 5 | # participant per code 6 | echo '.1 Write level2 codes to' ${UKBPDIR} 7 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched.csv | \ 8 | python ${PYDIR}/pheno/trim_atc_code.py --codelen=3 | \ 9 | sort -u > ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level2.csv 10 | 11 | # step 2 get the list of possible phenotypes 12 | echo '.2 extract list of possible phenotypes to' ${UKBPDIR} 13 | cut -f 3 -d ',' ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level2.csv | \ 14 | sort -u > ${UKBPDIR}/ukb_possible_med_phenotypes_level2.csv 15 | 16 | # step 3 generate phenotype annotations, get data from the CHEMBL db 17 | echo '.3 Get data from sqlite db, write to' ${UKBPDIR} 18 | cat ${UKBPDIR}/ukb_possible_med_phenotypes_level2.csv | \ 19 | python ${PYDIR}/chembl/generate_atc_medication_annotations_sqlite.py --level=2 > \ 20 | ${UKBPDIR}/Anno_medications_BIN_atc_level2.csv 21 | 22 | #echo '.3 Get data from mysql db, write to' ${UKBPDIR} 23 | #cat ${UKBPDIR}/ukb_possible_med_phenotypes_level2.csv | \ 24 | # python ${PYDIR}/chembl/generate_atc_medication_annotations.py --level=2 > \ 25 | # ${UKBPDIR}/Anno_medications_BIN_atc_level2.csv 26 | 27 | # step 4 generate phenotypes 28 | echo '.4 write PheWAS phenos to' ${UKBPDIR} 29 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level2.csv | \ 30 | python ${PYDIR}/pheno/generate_medication_phenotypes.py \ 31 | --pfile=${UKBPDIR}/Anno_medications_BIN_atc_level2.csv > \ 32 | ${UKBPDIR}/med_phenotypes_level2.tsv 33 | #------------------------------------------------------------------------------------------------ 34 | echo 'END' 35 |
--------------------------------------------------------------------------------
/py/chembl/generate_atc_medication_annotations.py:
--------------------------------------------------------------------------------
1 | import time 2 | import pymysql 3 | import os, sys 4 | from datahelper import Datahelper 5 | from optparse import OptionParser 6 | 7 | def main(options): 8 | """ 9 | Access the CHEMBL db for each input line and use the description 10 | from the appropriate level 11 | """ 12 | 13 | level = int(options.level) 14 | 15 | dh = Datahelper() 16 | try: 17 | chembl = pymysql.connect(host=os.environ["CHHOST"], user=os.environ["CHUSER"], passwd=os.environ["CHPWD"], 18 | port=int(os.environ["CHPORT"]), db=os.environ["CHDB"]) 19 | except: 20 | #print "Unexpected error:", sys.exc_info()[0] 21 | print("Unexpected error:", sys.exc_info()) 22 | exit() 23 | 24 | print("pheno,PHENOTYPE,Category,type") 25 | 26 | count = 0 27 | for line in sys.stdin: 28 | data = line.strip().split(',') 29 | atc_code = data[0] 30 | query = "select level{0}_description from atc_classification where level{1} = '{2}' limit 1".format(level, level, atc_code) 31 | 32 | 33 | cursor = chembl.cursor() 34 | cursor.execute(query) 35 | for row in cursor: 36 | count += 1 37 | pheno_string = dh.get_normalised_phrase(row[0]) 38 | pheno_string = dh.make_pheno_string(pheno_string) 39 | data.append(data[0] + "_" + pheno_string) 40 | data.append(pheno_string) 41 | data.append("BINARY") 42 | print(','.join(data)) 43 | 44 | chembl.close() 45 | return count 46 | 47 | # execution flow starts here 48 | # 49 | start_time = time.time() 50 | parser = OptionParser() 51 | parser.add_option("-l", "--level", dest="level", 52 | help="ATC level", metavar="INT") 53 | (options, args) = parser.parse_args() 54 | 55 | rcount = main(options) 56 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_atc_prep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Assign ATC level3 codes to participant,medication data 5 | # Includes steps to cut only the relevant columns from matched data 6 | # and to combine automatically matched data with manually 7 | # assigned codes 8 | # 9 | # Just get the matches from the manually edited file 10 | cp -f ${ATCDATADIR}/results/atc_manual_matches.csv ${ATCDATADIR}/results/atc_manual_matches.csv.BAK 11 | egrep -vw "UKBB_code|NA" ${ATCDATADIR}/results/atc_manual_matches.csv > \ 12 | ${ATCDATADIR}/results/atc_manual_matches_detail.csv 13 | # Cut relevant columns 14 | cut -f 1,2,5 -d ',' ${ATCDATADIR}/results/atc_manual_matches_detail.csv > \ 15 | ${ATCDATADIR}/results/atc_manual_matches_cut.csv 16 | cut -f 1,2,6 -d ',' ${ATCDATADIR}/results/atc_matched.csv > \ 17 | ${ATCDATADIR}/results/atc_auto_matches_cut.csv 18 | 19 | # Combine auto and manually matched codes 20 | cat ${ATCDATADIR}/results/atc_auto_matches_cut.csv \ 21 | ${ATCDATADIR}/results/atc_manual_matches_cut.csv > \ 22 | ${ATCDATADIR}/results/atc_all_matches.csv 23 | 24 | # Assign codes where possible to all items in the reported medication list 25 | cat ${UKBPDIR}/ukb_20003_reported_medication_n.csv | \ 26 | python ${PYDIR}/pheno/assign_codes_to_participant_data.py \ 27 | --codefile=${ATCDATADIR}/results/atc_all_matches.csv > \ 28 | ${UKBPDIR}/ukb_20003_with_atc_codes.csv 29 | 30 | # Get list of unmatched participant medication data 31 | grep -w NA ${UKBPDIR}/ukb_20003_with_atc_codes.csv > \ 32 | ${UKBPDIR}/ukb_20003_with_atc_codes_unmatched.csv 33 | 34 | # Get list of matched participant medication data (without a header); this 35 | # will feed into phenotype generation 36 | grep -wv NA ${UKBPDIR}/ukb_20003_with_atc_codes.csv | sed '1,1d' > \ 37 | ${UKBPDIR}/ukb_20003_with_atc_codes_matched.csv 38 | 39 |
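# The matched file now holds one row per participant per assigned code, of the form
# eid,ukbb_code,atc_code (hypothetical example row: 1000001,123456,A10B), which feeds
# the level2/level3 phenotype generation scripts.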
-------------------------------------------------------------------------------- /sh/atc_post_process_match_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Post process the ATC results files 4 | # 5 | # differentiate between unmatched and matched data 6 | egrep -w "NA" ${ATCDATADIR}/results/atc_res.csv > \ 7 | ${ATCDATADIR}/results/atc_res_NA.csv 8 | egrep -vw "NA" ${ATCDATADIR}/results/atc_res.csv > \ 9 | ${ATCDATADIR}/results/atc_res_NONA.csv 10 | # add UKBB counts to matches and unmatches 11 | cat ${ATCDATADIR}/results/atc_res_NA.csv | \ 12 | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > \ 13 | ${ATCDATADIR}/results/atc_missing.csv 14 | cat ${ATCDATADIR}/results/atc_res_NONA.csv | \ 15 | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > \ 16 | ${ATCDATADIR}/results/atc_matched.csv 17 | # one_word matches are the most risky 18 | grep ":1," ${ATCDATADIR}/results/atc_matched.csv > \ 19 | ${ATCDATADIR}/results/atc_one_word_match_full.csv 20 | cut -f 1,2,3,5,6 -d ',' ${ATCDATADIR}/results/atc_one_word_match_full.csv > \ 21 | ${ATCDATADIR}/results/one_word_match_list.csv 22 | 23 | # Following steps extract data for manual examination / intervention 24 | # all matches 25 | cut -f 1,2,6 -d ',' ${ATCDATADIR}/results/atc_matched.csv > \ 26 | ${ATCDATADIR}/results/atc_matched_list.csv 27 | sort ${ATCDATADIR}/results/atc_matched_list.csv > \ 28 | ${ATCDATADIR}/results/atc_matched_list_sorted.csv 29 | # looking at missing data 30 | cut -f 1,2,8 -d ',' ${ATCDATADIR}/results/atc_missing.csv > \ 31 | ${ATCDATADIR}/results/atc_unmatched_list.csv 32 | sort ${ATCDATADIR}/results/atc_unmatched_list.csv > \ 33 | ${ATCDATADIR}/results/atc_unmatched_list_sorted.csv 34 | # unique matched UKBB ids 35 | cut -f 1,2 -d ',' ${ATCDATADIR}/results/atc_matched.csv | \ 36 | sort -u > ${ATCDATADIR}/results/atc_matched_unique.csv 37 | cut -f 1,2 -d ',' ${ATCDATADIR}/results/atc_matched.csv | \ 38 | sort | \ 39 | uniq -c | \ 40 | sort -nr > ${ATCDATADIR}/results/atc_match_counts.csv 41 | 42 | -------------------------------------------------------------------------------- /py/pheno/assign_codes_to_participant_data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Assign Classification System codes to participant data 3 | # Outputs "NA" where no code is present 4 | # 5 | import time 6 | import datetime 7 | import re 8 | import os, sys 9 | import random 10 | import json 11 | from optparse import OptionParser 12 | from datahelper import Datahelper 13 | 14 | def load_cs_codes(fh): 15 | code_lookup = {} 16 | 17 | for line in fh: 18 | data = line.strip().split(',') 19 | if data[0] not in code_lookup: 20 | code_lookup[data[0]] = [] 21 | code_lookup[data[0]].append(data[2]) 22 | 23 | return code_lookup 24 | 25 | def main(options): 26 | count = 0 27 | match_count = 0 28 | miss_count = 0 29 | dh = Datahelper() 30 | 31 | try: 32 | fh = open(options.codefile, "r") 33 | code_lookup = load_cs_codes(fh) 34 | #print len(synonyms) 35 | except IOError as e: 36 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 37 | exit() 38 | except TypeError as e: 39 | print("Missing arguments ", e) 40 | exit() 41 | except: 42 | #print "Unexpected error:", sys.exc_info()[0] 43 | print("Unexpected error:", sys.exc_info()) 44 | exit() 45 | 46 | hdr = sys.stdin.readline().strip() 47 | print("{0},{1}".format(hdr, "cs_code")) 48 | 49 | for line in 
sys.stdin: 50 | count += 1 51 | data = line.strip().split(',') 52 | data.append("NA") 53 | if data[1] in code_lookup: 54 | for code in code_lookup[data[1]]: 55 | data[-1] = code 56 | match_count += 1 57 | print(",".join(data)) 58 | else: 59 | miss_count += 1 60 | print(",".join(data)) 61 | 62 | return count, match_count, miss_count 63 | 64 | # execution flow starts here 65 | # 66 | start_time = time.time() 67 | parser = OptionParser() 68 | # 69 | parser.add_option("-c", "--codefile", dest="codefile", 70 | help="UKBB vs CS code file", metavar="FILE") 71 | 72 | (options, args) = parser.parse_args() 73 | 74 | count, ycount, ncount = main(options) 75 | #print "END:", time.time() - start_time, "seconds", count, ycount, ncount 76 | 77 | -------------------------------------------------------------------------------- /py/merge_chembl_synonyms.py: -------------------------------------------------------------------------------- 1 | # 2 | # Attempt to match CHEMBL synonyms with coding data 3 | # Produce output with attached synonyms, where possible 4 | # 5 | import time 6 | import datetime 7 | import re 8 | import os, sys 9 | import random 10 | import json 11 | from optparse import OptionParser 12 | from datahelper import Datahelper 13 | 14 | def load_synonyms(fh): 15 | """ 16 | Load WHOLE synonyms only into a python dictionary 17 | which will then be used as a look-up for input data 18 | Output coding data with synonyms attached, 19 | where possible 20 | """ 21 | synonyms = {} 22 | 23 | for line in fh: 24 | data = line.strip().split('\t') 25 | syns1 = data[1].split('|') 26 | syns = [x.strip() for x in syns1] 27 | for syn in syns: 28 | if syn not in synonyms: 29 | synonyms[syn] = [] 30 | for asyn in syns: 31 | if asyn not in synonyms[syn]: 32 | synonyms[syn].append(asyn) 33 | 34 | return synonyms 35 | 36 | def main(options): 37 | count = 0 38 | mcount = 0 39 | umcount = 0 40 | dh = Datahelper() 41 | 42 | try: 43 | fh = open(options.synfile, "r") 44 | synonyms = load_synonyms(fh) 45 | except IOError as e: 46 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 47 | exit() 48 | except TypeError as e: 49 | print("Missing arguments ", e) 50 | exit() 51 | except: 52 | print("Unexpected error:", sys.exc_info()) 53 | exit() 54 | 55 | for line in sys.stdin: 56 | count += 1 57 | data = line.strip().split(',') 58 | phrase = data[1].lower() 59 | matched = False 60 | for key in dh.get_merge_key_list(phrase): 61 | if key in synonyms: 62 | print("{0},{1},{2}".format(data[0], phrase, '|'.join(synonyms[key]))) 63 | matched = True 64 | mcount += 1 65 | break 66 | if matched == False: 67 | print("{0},{1}".format(data[0], phrase)) 68 | 69 | return count, mcount 70 | 71 | # execution flow starts here 72 | # 73 | start_time = time.time() 74 | parser = OptionParser() 75 | # 76 | parser.add_option("-s", "--synfile", dest="synfile", 77 | help="molecule synonyms", metavar="FILE") 78 | 79 | (options, args) = parser.parse_args() 80 | 81 | count, mcount = main(options) 82 | 83 | -------------------------------------------------------------------------------- /py/pheno/generate_medication_phenotypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Generate binary phenotypes from the set of UKB 3 | # participants reporting taking medications 4 | # Load the column list as a code vs num array 5 | # 6 | # For each person, output a line which has 0's 7 | # where the person has taken no medication in a category 8 | # and 1's where they have 9 | # HOW TO PROPERLY QC THIS? 
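# Example (hypothetical codes): with annotation columns [A02B, A10B, C07A], a participant
# whose reported medications map to A02B and C07A yields the tab-separated row: eid eid 1 0 1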
10 | import time 11 | import datetime 12 | import re 13 | import os, sys 14 | import random 15 | import json 16 | from optparse import OptionParser 17 | 18 | def load_pheno_names(pfile): 19 | """ 20 | Here the column order for the output file is set 21 | to the order of reading the file records 22 | """ 23 | plookup = {} 24 | pcolnames = [] 25 | col = 0 26 | 27 | hdr = pfile.readline() 28 | 29 | for line in pfile: 30 | data = line.strip().split(",") 31 | plookup[data[0]] = col 32 | pcolnames.append(data[1]) 33 | col += 1 34 | 35 | return plookup, pcolnames 36 | 37 | def main(options): 38 | count = 0 39 | synonyms = {} 40 | ccol = int(options.codecol) 41 | #codes = {} 42 | 43 | try: 44 | fh = open(options.pfile, "r") 45 | plookup, pcolnames = load_pheno_names(fh) 46 | except: 47 | print("Failed to open phenotype code file {0}".format(options.pfile)) 48 | sys.exit() 49 | 50 | #print "FID,IID,%s" % (",".join(pcolnames)) 51 | print("FID\tIID\t{0}".format("\t".join(pcolnames))) 52 | #print len(pcolnames) + 2 53 | 54 | last_eid = "" 55 | phen_array = ['0'] * len(pcolnames) 56 | 57 | for line in sys.stdin: 58 | data = line.strip().split(",") 59 | if data[0] != last_eid: 60 | if last_eid != "": 61 | #print "%s,%s,%s" % (last_eid, last_eid, ",".join(phen_array)) 62 | print("{0}\t{1}\t{2}".format(last_eid, last_eid, "\t".join(phen_array))) 63 | phen_array = ['0'] * len(pcolnames) 64 | last_eid = data[0] 65 | if data[ccol] in plookup: 66 | phen_array[plookup[data[ccol]]] = '1' 67 | 68 | #print "%s,%s,%s" % (last_eid, last_eid, ",".join(phen_array)) 69 | print("{0}\t{1}\t{2}".format(last_eid, last_eid, "\t".join(phen_array))) 70 | return count 71 | 72 | # execution flow starts here 73 | # 74 | start_time = time.time() 75 | parser = OptionParser() 76 | # 77 | parser.add_option("-p", "--pfile", dest="pfile", 78 | help="phenotype code file", metavar="FILE") 79 | 80 | parser.add_option("-c", "--codecol", dest="codecol", 81 | help="column containing med code", metavar="STR") 82 | 83 | (options, args) = parser.parse_args() 84 | 85 | if options.codecol == None: 86 | options.codecol = "2" 87 | 88 | count = main(options) 89 | #print "END:", time.time() - start_time, "seconds", count 90 | 91 |
--------------------------------------------------------------------------------
/py/pheno/cut_main_csv_file.py:
--------------------------------------------------------------------------------
1 | # 2 | # Cuts columns of interest from the main UKB phenotype csv file 3 | # 4 | # Takes account of multiple column versions contained within the file by matching column prefixes 5 | # in column headers 6 | # 7 | # Always outputs the eid field as the first one, followed by any column with the required prefix 8 | # 9 | import time 10 | import datetime 11 | import os, sys 12 | import csv 13 | from optparse import OptionParser 14 | 15 | start_time = time.time() 16 | sys.stdout.flush() 17 | 18 | def main(options): 19 | csvreader = None 20 | count=0 21 | idcol = int(options.idcol) 22 | 23 | try: 24 | csvfile = open(options.csvfile, "r") 25 | csvreader = csv.reader(csvfile) 26 | except IOError as e: 27 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 28 | exit() 29 | except TypeError as e: 30 | print("Missing arguments ", e) 31 | exit() 32 | except: 33 | print("Unexpected error:", sys.exc_info()) 34 | sys.exit() 35 | 36 | colprefs = options.colprefs.split(',') 37 | #print colprefs 38 | cols = [] 39 | outhdr = [] 40 | 41 | try: 42 | hdr = next(csvreader) 43 | #print len(hdr) 44 | outhdr.append(hdr[idcol]) 45 | cols.append(idcol) 46 | # Process the header record to capture column indices and build the output 47 | # header record 48 | for colpref in colprefs: 49 | for i, col in enumerate(hdr): 50 | coldata=col.split('-') 51 | if coldata[0] == colpref: 52 | outhdr.append(col) 53 | #print i, coldata 54 | cols.append(i) 55 | print(",".join(outhdr)) 56 | #print cols 57 | #print len(outhdr) 58 | 59 | for row in csvreader: 60 | outrec=[] 61 | count += 1 62 | # iterate over each row element 63 | # for i,elem in enumerate(row): 64 | # if i in cols: 65 | # outrec.append(elem) 66 | for idx in cols: 67 | outrec.append(row[idx]) 68 | print(",".join(outrec)) 69 | except: 70 | print("Unexpected error (2):", sys.exc_info()[0]) 71 | print(sys.exc_info()) 72 | sys.exit() 73 | 74 | return count 75 | # 76 | # execution flow starts here 77 | # 78 | parser = OptionParser() 79 | parser.add_option("-c", "--csvfile", dest="csvfile", 80 | help="csv file containing main UKB data", metavar="FILE") 81 | # col prefixes are comma separated - no complaint is made if a prefix doesn't exist in the data 82 | parser.add_option("-p", "--colprefs", dest="colprefs", 83 | help="UKB column prefixes", metavar="FILE") 84 | 85 | parser.add_option("-i", "--idcol", dest="idcol", 86 | help="Column number of id, default 0", metavar="INT") 87 | 88 | (options, args) = parser.parse_args() 89 | if options.idcol == None: 90 | options.idcol = "0" 91 | # 92 | rec_count = main(options) 93 | #print "END:", time.time() - start_time, "seconds", rec_count 94 | 95 | 96 |
--------------------------------------------------------------------------------
/py/code_data_match.py:
--------------------------------------------------------------------------------
1 | # The main text match process 2 | # 3 | # Dictionaries: 4 | # 5 | # Classification system code file data fields: 6 | # 1 the code 7 | # 2 the description 8 | # 3 synonyms added from CHEMBL (separated by '|') 9 | # 10 | import time 11 | import datetime 12 | import re 13 | import string 14 | import os, sys 15 | from optparse import OptionParser 16 | from datahelper import Datahelper 17 | 18 | def main(options): 19 | """ 20 | The main match process - look up descriptions and 21 | synonyms in the coding data dictionary (loaded 22 | on initialisation) 23 | SEE ALSO: datahelper.py 24 | """ 25 | dcount = 0 26 | count = 0 27 | match_count = 0 28 | miss_count = 0 29 | 30 | # try to load the classification system codes file 31 | try: 32 | fh = open(options.clsfile, "r") 33 | dh = Datahelper() 34 | dcount = dh.load_cls_phrases(fh) 35 | #print "Dictionary size = %d" % (dcount) 36 | except IOError as e: 37 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 38 | print("I/O error:", sys.exc_info()) 39 | exit() 40 | except TypeError as e: 41 | print("Missing arguments ", e) 42 | exit() 43 | except: 44 | #print "Unexpected error:", sys.exc_info()[0] 45 | print("Unexpected error:", sys.exc_info()) 46 | exit() 47 | 48 | # stdin used to read in medications coding data 49 | #hdr = sys.stdin.readline() 50 | for line in sys.stdin: 51 | count += 1 52 | matched = False 53 | data = line.strip().split(',') 54 | all_phrases = [data[1]] 55 | if len(data) == 3: 56 | all_phrases += data[2].split('|') 57 | 58 | match_string = "" 59 | code_array, match_data, last_match, selected_code = dh.match_all_phrases(all_phrases) 60 | if len(code_array) > 0: 61 | if (options.multioutput == True): 62 | # Current policy: output one line per code match (can be multiple per input record) 63 | for code_elem in code_array: 64 | code_data =
code_elem.split("~") 65 | print("{0},{1},{2},{3},{4},{5},{6}".format(data[0], data[1], last_match, '|'.join(match_data), code_data[1], code_data[0], len(code_array))) 66 | else: 67 | print("{0},{1},{2},{3},{4},{5},{6}".format(data[0], data[1], last_match, '|'.join(match_data), 0, selected_code, len(code_array))) 68 | match_count += 1 69 | else: 70 | print("{0},{1},{2},{3},{4},{5},{6}".format(data[0], data[1], last_match, '|'.join(match_data), "NA", "NA", 0)) 71 | miss_count += 1 72 | 73 | return count, match_count, miss_count 74 | 75 | 76 | # execution flow starts here 77 | # 78 | parser = OptionParser() 79 | parser.add_option("-c", "--clsfile", dest="clsfile", 80 | help="file contains input classification system codes and descriptions", metavar="FILE") 81 | parser.add_option("-m", "--multioutput", dest="multioutput", 82 | help="output multiple classification codes per source system line", metavar="STR") 83 | 84 | start_time = time.time() 85 | (options, args) = parser.parse_args() 86 | 87 | if options.multioutput == "Y": 88 | options.multioutput = True 89 | else: 90 | options.multioutput = False 91 | 92 | count, match_count, miss_count = main(options) 93 | #print "END:", time.time() - start_time, "seconds", count, match_count, miss_count 94 | 95 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at pdappleby@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UK Biobank Self Reported Medication Data parsing and matching 2 | ## Background 3 | UK Biobank self-reported medications are represented as a list of codes per participant, captured at the time of the Baseline Assessment Interview. 4 | 5 | For example, participant id '000001' may have reported medications with UKBB codes 1140922174, 1140879424, 1140879616, 1197 and 2038460150. 6 | 7 | These are described in the UKBB medication codes table as: 8 | 9 | | Code | Description | Report Count | 10 | | -------- | -------------------------- | -----------: | 11 | |1140922174|alendronate sodium |6380 | 12 | |1140879424|alverine |308 | 13 | |1140879616|amitriptyline |10119 | 14 | |1197 |evening primrose oil product|1132 | 15 | |2038460150|paracetamol |100036 | 16 | 17 | There is no structure in the data and no means of grouping medications into categories such as "Drugs for Diabetes" or "Drugs to control Asthma". 18 | 19 | ## Aims 20 | Write and test software to match terms in the UK Biobank Self Reported Medication data coding table with terms in both the Anatomical Therapeutic Chemical (ATC) classification system and the British National Formulary (BNF) coding system. The overall aim is to assign higher-level, well-known codes to allow grouping of the data. The resulting matched data can then be used in conjunction with the UKBB medication codes assigned at assessment time to generate evidence for use in both individual clinical phenotypes and in ranges of clinical phenotypes as found in Phenome-Wide Association Studies (PheWAS). 21 | 22 | ## Description 23 | Matching code is written in Python (2.7 was initially used for development; changes for Python 3 have now been made). Extensive use of Bash shell wrappers is made to supply context at run time: code and data directory locations, data file names and database access parameters.
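To make the matching policy concrete, here is a condensed sketch of the fallback order implemented by `match_all_phrases` in `py/datahelper.py`: the whole phrase first, then the prefix trigram, then the prefix bigram, then single words that survive the exclusion rules. The `cls_phrases` and `excluded` arguments stand in for the dictionaries the real class builds at load time, and the simple length test below is a simplification of the full exclusion logic (which also drops measures such as '50mg' and all-digit tokens).

```python
import re

def normalise(phrase):
    # Collapse runs of non-word characters to single spaces and lower-case,
    # as in Datahelper.get_normalised_phrase
    return re.sub(r'[\W_ ]+', ' ', phrase).lower()

def match(phrase, cls_phrases, excluded):
    # A: try the whole phrase
    if phrase in cls_phrases:
        return cls_phrases[phrase]
    words = normalise(phrase).split()
    # 3 / 2: try the prefix trigram, then the prefix bigram
    for n in (3, 2):
        key = ' '.join(words[:n])
        if len(words) >= n and key in cls_phrases:
            return cls_phrases[key]
    # 1: try single words, skipping excluded and short (< 4 char) tokens
    for word in words:
        if word not in excluded and len(word) >= 4 and word in cls_phrases:
            return cls_phrases[word]
    return None
```

With the real exclusion rules, a phrase like 'atenolol 50mg tablet' falls through to the single-word step and matches on 'atenolol', since 'tablet' is on the excluded-word list and '50mg' is rejected as a measure.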
24 | 25 | The key features to note are: 26 | 27 | - Matching is performed using combinations of whole words only (no partial word matches); in the case of matching synonyms to descriptions (Step 01), whole-phrase matching is used. 28 | 29 | - Synonyms from the ChEMBL database are attached to both 'sides' of the main medication term match, as part of data preparation. 30 | 31 | - During testing, an excluded-word list (cf. stop words in Natural Language Processing) was built iteratively to prevent the software from making unwanted one-word matches. 32 | 33 | - All code is intended to be run from a Linux / Unix command line. 34 | 35 | The following subdirectories can be found in the repository: 36 | 37 | - *env/* The environment variables used are shown in a single file, 'common_tplt'; users should complete these and copy the file to one named 'common'. Users must also pre-define the **PROJDATA** and **PROJROOT** environment variables, as these are used as the roots of the data and code directory trees. Parameters for local ChEMBL database access are also required, for drug synonym extraction and ATC code extraction. The code example supplied is for the sqlite edition of the ChEMBL database, which can be found at [The ChEMBL site, download section](https://www.ebi.ac.uk/chembl); the project used version 23, which is no longer the latest version at the time of writing. Downloads for other DBMSs are available. 38 | 39 | - *py/* Python scripts, including the scripts that match synonyms prior to matching across coding systems. Also included is the module 'datahelper.py', which is where the text matching code is to be found. There are two lower-level directories holding scripts to extract and format ChEMBL data and to process phenotype data. 40 | 41 | - *sh/* Bash shell scripts: wrappers for the Python code, split into several main functions and provided for coding against the ATC and BNF classification systems. 42 | 43 | Top-level scripts are prefixed '01_', '02_', '03_' and '04_' for each classification system and call several lower-level bash scripts. The '03_' scripts are for code assignment to data for individual medication reports in UKBB; the path and filename for a project's UKBB phenotype data must be supplied via the sourced 'common' environment parameter file. 44 | 45 | - *data/* Generated match data for both ATC and BNF coding; this does not include manually assigned coding. The excluded-word list, which should probably be an independent text file, is embedded in ../py/datahelper.py and can be extracted using the .../*py*/list_excl_words.py script. 46 | 47 | ## Running 48 | Once the environment has been set up (see the note on the .../*env*/common file above), four scripts are run, with parameters for either ATC or BNF code assignment, through to binary (0/1) phenotype generation for PheWAS. 49 | 50 | Script '01' runs data preparation steps; note that raw BNF data is not supplied in this repository due to potential licensing requirements. At this point ChEMBL synonyms and, in the case of the ATC-based match, ATC codes and terms are extracted from the ChEMBL database. 51 | 52 | Script '02' runs matching scripts to 'merge' in ChEMBL synonym data and produce data on matched and unmatched UKBB medication codes. 53 | 54 | Script '03' calls scripts to extract medication detail data from the main UKBB phenotype csv file (the name of this varies by project and should be assigned in the .../*env*/common file via an environment variable) and then to assign the ATC or BNF codes output in step 02, as sketched below.
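Column extraction relies on the UKB header convention of repeating a field id across instances and array slots (e.g. `20003-0.0`, `20003-0.1`, ...). The sketch below shows the core of what `py/pheno/cut_main_csv_file.py` does: keep the participant id column, then every column whose header prefix (the text before the '-') equals a requested field id. Field id 20003 (self-reported medication) is used for illustration; the real script takes the file name, prefixes and id column as options.

```python
import csv
import sys

def cut_columns(reader, prefix, idcol=0):
    # Keep the id column plus every column whose header matches the
    # requested UKB field id prefix (the text before the '-')
    header = next(reader)
    cols = [idcol] + [i for i, name in enumerate(header)
                      if name.split('-')[0] == prefix]
    yield [header[i] for i in cols]
    for row in reader:
        yield [row[i] for i in cols]

# Illustrative use: cut the self-reported medication columns from a CSV
# file supplied on stdin
for out in cut_columns(csv.reader(sys.stdin), "20003"):
    print(",".join(out))
```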
55 | 56 | Script '04' calls scripts to annotate and generate PheWAS phenotypes as binary 1 (CASE) or 0 (CONTROL) 57 | 58 | For ATC-based matching run: 59 | - 01_atc_prepare_sqlite.sh 60 | - 02_atc_match.sh 61 | - 03_get_ukbb_srmed_data_atc.sh 62 | - 04_make_atc_phewas_phenotypes.sh 63 | 64 | For BNF-based matching run: 65 | - 01_bnf_prepare_sqlite.sh 66 | - 02_bnf_match.sh 67 | - 03_get_ukbb_srmed_data_bnf.sh 68 | - 04_make_bnf_phewas_phenotypes.sh 69 | 70 | ## Flow Summary, steps 01 and 02 71 | ![](images/ukbb_srmed.png) 72 | 73 | ## Notice 74 | 75 | BNF coding system data was obtained and is presented in accordance with the “Open Government Licence for Public Sector Information” (http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) and the NHS Business Services Authority (NHSBSA) “Terms and Conditions for Users” (https://www.nhsbsa.nhs.uk/our-policies/terms-and-conditions). Authors: PDA, ASFD and ERJ as users of this resource also make the following statement consistent with these terms and conditions: “NHSBSA BNF Classification Coding, NHSBSA Copyright 2019” This information is licenced under the terms of the Open Government Licence. 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 | -------------------------------------------------------------------------------- /py/datahelper.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | # 4 | # Methods for assisting with matching, 5 | # generating dictionary and matching keys, 6 | # listing exclusion words and applying exclusion rules 7 | # 8 | class Datahelper: 9 | def __init__(self): 10 | self.cls_phrases = {} 11 | # Explicit exclusions - short words and measures are excluded in code 12 | # NLP systems / libraries, such as the NLTK, refer to these as "stop-words" 13 | self.excluded_words = { 14 | 'acting': 1, 15 | 'active': 1, 16 | 'activated': 1, 17 | 'artificial': 1, 18 | 'band': 1, 19 | 'coconut': 1, 20 | 'food': 1, 21 | 'biscuit': 1, 22 | 'biscuits': 1, 23 | 'good': 1, 24 | 'soft': 1, 25 | 'choice': 1, 26 | 'house': 1, 27 | 'half': 1, 28 | 'total': 1, 29 | 'alka': 1, 30 | 'alpha': 1, 31 | 'beta': 1, 32 | 'night': 1, 33 | 'nurse': 1, 34 | 'dome': 1, 35 | 'continus': 1, 36 | 'depot': 1, 37 | 'mini': 1, 38 | 'micro': 1, 39 | 'over': 1, 40 | 'long': 1, 41 | 'slow': 1, 42 | 'daily': 1, 43 | 'hayfever': 1, 44 | 'counter': 1, 45 | 'mild': 1, 46 | 'with': 1, 47 | 'other': 1, 48 | 'single': 1, 49 | 'double': 1, 50 | 'triple': 1, 51 | 'once': 1, 52 | 'flavour': 1, 53 | 'fruit': 1, 54 | 'cream': 1, 55 | 'need': 1, 56 | 'needs': 1, 57 | 'vera': 1, 58 | 'infusion': 1, 59 | 'succinate': 1, 60 | 'palmitate': 1, 61 | 'intensol': 1, 62 | 'poly': 1, 63 | 'prep': 1, 64 | 'bag': 1, 65 | 'bags': 1, 66 | 'preparation': 1, 67 | 'preparations': 1, 68 | 'preps': 1, 69 | 'shampoo': 1, 70 | 'shower': 1, 71 | 'wash': 1, 72 | 'enema': 1, 73 | 'soap': 1, 74 | 'solution': 1, 75 | 'soln': 1, 76 | 'contact': 1, 77 | 'incontinence': 1, 78 | 'diluent': 1, 79 | 'blocker': 1, 80 | 'emulsion': 1, 81 | 'emuls': 1, 82 | 'emulsifying': 1, 83 | 'lotion': 1, 84 | 'lotio': 1, 85 | 'derm': 1, 86 | 'aveeno': 1, 87 | 'soluble': 1, 88 | 'suspension': 1, 89 | 'susp': 1, 90 | 'various': 1, 91 | 'paint': 1, 92 | 'liquid': 1, 93 | 'tablet': 1, 94 | 'tablets': 1, 95 | 'pill': 1, 96 | 'pills': 1, 97 | 'perles': 1, 98 | 'pastille': 1, 99 | 'chewable': 1, 100 | 'granules': 1, 101 | 'mixture': 1, 102 | 'mixtures': 1, 103 | 'remedy': 1, 104 | 'remedies': 1, 105 | 'therapy': 1, 106 | 'therapies': 1, 107 | 'emollient': 1, 108 | 'peel': 1, 109 | 'gppe': 1, 110 | 'ointment': 1, 111 | 'effervescent': 1, 112 | 'capsule': 1, 113 | 'spansule': 1, 114 | 'caplet': 1, 115 | 'cycle': 1, 116 | 'husk': 1, 117 | 'strong': 1, 118 | 'strength': 1, 119 | 'suppository': 1, 120 | 'supplement': 1, 121 | 'compound': 1, 122 | 'comp': 1, 123 | 'caustic': 1, 124 | 'pellet': 1, 125 | 'elixir': 1, 126 | 'drops': 1, 127 | 'autohaler': 1, 128 | 'turbohaler': 1, 129 | 'inhaler': 1, 130 | 'sach': 1, 131 | 'sachet': 1, 132 | 'sachets': 1, 133 | 'syrup': 1, 134 | 'dried': 1, 135 | 'castor': 1, 136 | 'oilatum': 1, 137 | 'oily': 1, 138 | 'salt': 1, 139 | 'salts': 1, 140 | 'saline': 1, 141 | 'yeast': 1, 142 | 'tears': 1, 143 | 'ophthalmic': 1, 144 | 'complexes': 1, 145 | 'comp': 1, 146 | 'aqua': 1, 147 | 'aqueous': 1, 148 | 'hormone': 1, 149 | 'sugar': 1, 150 | 'plain': 1, 151 | 'anti': 1, 152 | 'retard': 1, 153 | 'drug': 1, 154 | 'lozenge': 1, 155 | 'lozenges': 1, 156 | 'nasal': 1, 157 | 'spray': 1, 158 | 'paste': 1, 159 | 'tincture': 1, 160 | 'oral': 1, 161 | 'injection': 1, 162 | 'injectable': 1, 163 | 'applicator': 1, 164 | 'ampoule': 1, 165 | 'syringe': 1, 166 | 'topical': 1, 167 | 'duopack': 1, 168 | 'pack': 1, 169 
| 'combination': 1, 170 | 'combinations': 1, 171 | 'prefilled': 1, 172 | 'continuous': 1, 173 | 'dispersible': 1, 174 | 'patch': 1, 175 | 'gastro': 1, 176 | 'resistant': 1, 177 | 'allergy': 1, 178 | 'relief': 1, 179 | 'wool': 1, 180 | 'sand': 1, 181 | 'tube': 1, 182 | 'stnd': 1, 183 | 'aloe': 1, 184 | 'ortho': 1, 185 | 'auto': 1, 186 | 'health': 1, 187 | 'cover': 1, 188 | 'bath': 1, 189 | 'powder': 1, 190 | 'resin': 1, 191 | 'solvent': 1, 192 | 'solv': 1, 193 | 'mist': 1, 194 | 'saliva': 1, 195 | 'balsam': 1, 196 | 'tonic': 1, 197 | 'additive': 1, 198 | 'liniment': 1, 199 | 'recon': 1, 200 | 'combined': 1, 201 | 'dual': 1, 202 | 'substitute': 1, 203 | 'formula': 1, 204 | 'green': 1, 205 | 'yellow': 1, 206 | 'red': 1, 207 | 'blue': 1, 208 | 'orange': 1, 209 | 'buff': 1, 210 | 'golden': 1, 211 | 'white': 1, 212 | 'paed': 1, 213 | 'paediatric': 1, 214 | 'peppermint': 1, 215 | 'mint': 1, 216 | 'pine': 1, 217 | 'caramel': 1, 218 | 'natural': 1, 219 | 'vitamin': 1, 220 | 'vitamins': 1, 221 | 'enzyme': 1, 222 | 'product': 1, 223 | 'junior': 1, 224 | 'cold': 1, 225 | 'unknown': 1, 226 | 'free': 1, 227 | 'body': 1, 228 | 'nose': 1, 229 | 'sinus': 1, 230 | 'stomach': 1, 231 | 'scalp': 1, 232 | 'intramuscular': 1, 233 | 'sublingual': 1, 234 | 'breath': 1, 235 | 'sleep': 1, 236 | 'drowsy': 1, 237 | 'litre': 1, 238 | 'actuated': 1, 239 | 'vantage': 1, 240 | 'numark': 1, 241 | 'care': 1, 242 | 'galpharm': 1, 243 | 'merck': 1, 244 | 'pharmacy': 1, 245 | 'fish': 1, 246 | 'aluminium': 1, 247 | 'calcium': 1, 248 | 'sodium': 1, 249 | 'cromoglycate': 1, 250 | 'potassium': 1, 251 | 'chloride': 1, 252 | 'nitrate': 1, 253 | 'sulphate': 1, 254 | 'salicylic': 1, 255 | 'hydrochloride': 1, 256 | 'disodium': 1, 257 | 'zinc': 1, 258 | 'magnesium': 1, 259 | 'breathe': 1, 260 | 'wound': 1, 261 | 'citrate': 1, 262 | 'sulfate': 1, 263 | 'calmurid': 1, 264 | 'fucibet': 1, 265 | 'fucidin': 1, 266 | 'betnesol': 1, 267 | 'tobradex': 1, 268 | 'nystaform': 1, 269 | 'orabase': 1, 270 | 'betnovate': 1, 271 | 'polystyrene': 1, 272 | 'undecenoic': 1, 273 | 'oxide': 1, 274 | 'phosphate': 1, 275 | 'hydrate': 1, 276 | 'acetate': 1, 277 | 'fumarate': 1, 278 | 'sandoz': 1, 279 | 'pain': 1, 280 | 'mite': 1, 281 | 'remover': 1, 282 | 'removers': 1, 283 | 'acid': 1, 284 | 'alcohol': 1, 285 | 'coal': 1, 286 | 'extract': 1, 287 | 'mineral': 1, 288 | 'minerals': 1, 289 | 'forte': 1, 290 | 'simple': 1, 291 | 'plus': 1, 292 | 'multi': 1, 293 | 'vita': 1, 294 | 'adult': 1, 295 | 'liver': 1, 296 | 'skin': 1, 297 | 'factor': 1, 298 | 'human': 1, 299 | 'methyl': 1, 300 | 'piperazine': 1, 301 | 'deep': 1, 302 | 'ultra': 1, 303 | 'daktarin': 1, 304 | 'voltarol': 1, 305 | 'insulin': 1, 306 | 'panoxyl': 1, 307 | } 308 | self.valid_short_words = { 309 | 'gtn': 1, 310 | } 311 | 312 | def load_cls_phrases(self, fh): 313 | """ 314 | Build a dictionary of key phrases and words vs lists of codes from 315 | Classification System Data: see self.get_key_list(phrase) 316 | """ 317 | for line in fh: 318 | data = line.strip().split(',') 319 | # guards against unparseable lines 320 | if len(data) < 2: 321 | continue 322 | code = data[0] 323 | phrase_array = [data[1].lower().strip()] 324 | if len(data) > 2: 325 | syn_array = data[2].lower().strip().split('|') 326 | for syn in [s for s in syn_array if s not in phrase_array]: 327 | phrase_array.append(syn) 328 | 329 | for phrase in phrase_array: 330 | for key in set(self.get_key_list(phrase)): 331 | if key not in self.cls_phrases: 332 | self.cls_phrases[key] = [] 333 | self.cls_phrases[key].append(code) 334 | 335 | 
return len(self.cls_phrases) 336 | 337 | def get_phrase_dictionary(self): 338 | return self.cls_phrases 339 | 340 | def get_phrase_dictionary_keys(self): 341 | return sorted(self.cls_phrases.keys()) 342 | 343 | def get_excluded_words(self): 344 | return sorted(self.excluded_words.keys()) 345 | 346 | def match_all_phrases(self, inphrases): 347 | """ 348 | The most complicated function 349 | Attempt to match the argument phrases to the cls_phrases dictionary 350 | First attempt a match of all phrase, then all trigrams, then all 351 | bigrams, then single words 352 | 353 | Return: 354 | A list of matched_code counts, match_path, the matched phrase, the 355 | most commonly matched code(s) 356 | OR 357 | [], match path, last attempted match, None 358 | """ 359 | # temporary - attempted matches 360 | attempted_matches = [] 361 | phrase_attempts = {} 362 | phrase = "" 363 | step = "A" 364 | # ALL full phrases 365 | for phrase in inphrases: 366 | phrase_attempts[phrase] = 1 367 | attempted_matches.append(phrase + ':' + step) 368 | if phrase in self.cls_phrases: 369 | match_choices = self.cls_phrases[phrase] 370 | return (self.get_list_counts(match_choices), attempted_matches, 371 | phrase, self.get_most_common(match_choices)) 372 | 373 | # Normalised version of ALL all full phrases 374 | phrases = [self.get_normalised_phrase(p) for p in inphrases] 375 | 376 | # 3 all prefix trigrams 377 | step = "3" 378 | for ngram in [p.split()[0:3] for p in phrases if len(p.split()) > 2]: 379 | phrase = ' '.join(ngram) 380 | phrase_attempts[phrase] = 1 381 | attempted_matches.append(phrase + ':' + step) 382 | if phrase in self.cls_phrases: 383 | match_choices = self.cls_phrases[phrase] 384 | return (self.get_list_counts(match_choices), attempted_matches, 385 | phrase, self.get_most_common(match_choices)) 386 | 387 | # 2 all prefix bigrams 388 | step = "2" 389 | for ngram in [p.split()[0:2] for p in phrases if len(p.split()) > 1]: 390 | phrase = ' '.join(ngram) 391 | phrase_attempts[phrase] = 1 392 | attempted_matches.append(phrase + ':' + step) 393 | if phrase in self.cls_phrases: 394 | match_choices = self.cls_phrases[phrase] 395 | return (self.get_list_counts(match_choices), attempted_matches, 396 | phrase, self.get_most_common(match_choices)) 397 | 398 | # 1 all valid words 399 | step = "1" 400 | for phr_elem in phrases: 401 | for phrase in [w.strip() for w in phr_elem.split() 402 | if self.isExcluded(w.strip()) == False and w.strip() not in phrase_attempts]: 403 | phrase_attempts[phrase] = 1 404 | attempted_matches.append(phrase + ':' + step) 405 | if phrase in self.cls_phrases: 406 | match_choices = self.cls_phrases[phrase] 407 | return (self.get_list_counts(match_choices), attempted_matches, 408 | phrase, self.get_most_common(match_choices)) 409 | 410 | return [], attempted_matches, phrase, None 411 | 412 | def match_phrase(self, phrase): 413 | """ 414 | NOT USED CURRENTLY 415 | Attempt to match the argument phrase to the cls_phrases dictionary 416 | (built from all words passed in at init time) 417 | A phrase is matched iff: 418 | The whole string matches matches OR 419 | The prefix trigram matches OR 420 | The prefix bigram matches OR 421 | A single word, which is not an excluded word, matches 422 | 423 | Return: 424 | A matched code from the classification system or None 425 | """ 426 | key = None 427 | match_phrase = None 428 | for key in self.get_key_list(phrase): 429 | if key in self.cls_phrases: 430 | match_phrase = key 431 | break 432 | 433 | if match_phrase == None: 434 | return None, key 435 | 
return self.get_most_common(self.cls_phrases[match_phrase]), key 436 | 437 | def get_key_list(self, phrase): 438 | key_list = [] 439 | if self.isExcluded(phrase) == False: 440 | key_list = [phrase] 441 | 442 | ngram = self.get_normalised_phrase(phrase) 443 | if self.isExcluded(ngram) == False and ngram not in key_list: 444 | key_list.append(ngram) 445 | word_list = ngram.split() 446 | if len(word_list) > 2: 447 | key_list.append(' '.join(word_list[0:3])) 448 | if len(word_list) > 1: 449 | key_list.append(' '.join(word_list[0:2])) 450 | 451 | for word in [x for x in word_list if self.isExcluded(x.strip()) == False]: 452 | if word not in key_list: 453 | key_list.append(word) 454 | 455 | return key_list 456 | 457 | def get_merge_key_list(self, phrase): 458 | """ 459 | Get a list of keys for use while merging synonyms 460 | """ 461 | key_list = [] 462 | if self.isExcludedFromMerge(phrase) == False: 463 | key_list = [phrase] 464 | 465 | ngram = self.get_normalised_phrase(phrase) 466 | if self.isExcluded(ngram) == False and ngram not in key_list: 467 | key_list.append(ngram) 468 | word_list = ngram.split() 469 | if len(word_list) > 2: 470 | key_list.append(' '.join(word_list[0:3])) 471 | if len(word_list) > 1: 472 | key_list.append(' '.join(word_list[0:2])) 473 | 474 | for word in [x for x in word_list if self.isExcludedFromMerge(x.strip()) == False]: 475 | if word not in key_list: 476 | key_list.append(word) 477 | 478 | return key_list 479 | 480 | def get_key_list_whole_phrases(self, phrase): 481 | """ 482 | EXPERIMENTAL: Get a list of keys from whole phrases 483 | """ 484 | key_list = [phrase] 485 | ngram = self.get_normalised_phrase(phrase) 486 | key_list.append(ngram) 487 | 488 | return key_list 489 | 490 | def isExcluded(self, word): 491 | """ 492 | Used in the main match 493 | """ 494 | return ((self.isExcludedWord(word) != False) 495 | or (self.isMeasure(word) != False) 496 | or (self.isAllDigits(word) != False) 497 | or (self.isShortWord(word) != False)) 498 | 499 | def isExcludedFromMerge(self, word): 500 | """ 501 | Used when buliding dictionaries for use in 502 | synonym merging 503 | """ 504 | return ((self.isExcludedWord(word) != False) 505 | or (self.isMeasure(word) != False) 506 | or (self.isShortWord(word) != False)) 507 | 508 | def isExcludedWord(self, word): 509 | """ 510 | """ 511 | return word in self.excluded_words 512 | 513 | def get_most_common(self, lst): 514 | """ 515 | Return the most commonly occuring value in a list 516 | THIS NEEDS RE-IMPLEMENTING - not currently used 517 | """ 518 | data = Counter(lst) 519 | mc = data.most_common(2) 520 | return data.most_common(1)[0][0] 521 | 522 | def get_list_counts(self, lst): 523 | """ 524 | Return counts for data elements in a list 525 | """ 526 | counts = Counter(lst) 527 | return [c + "~" + str(counts[c]) for c in sorted(counts)] 528 | 529 | def get_best_guess(self, lst): 530 | """ 531 | NOT USED 532 | Return the best guess at a value from 533 | a list of strings 534 | """ 535 | maxlen = 0 536 | pass 537 | 538 | def make_pheno_string(self, words): 539 | return re.sub(r' +', '_', words).lower() 540 | 541 | def get_normalised_phrase(self, sentence): 542 | """ 543 | Regex to replace (multiples of) 544 | non-word characters and space with a single space 545 | """ 546 | return re.sub(r'[\W_ ]+', ' ', sentence).lower() 547 | 548 | def format_digit_code(self, code, level=3): 549 | code = code.strip() 550 | ch = code[0:2] 551 | s = code[2:4] 552 | ss = '00' 553 | if len(code) >=6: 554 | ss = code[4:6] 555 | if ss != '00' and level 
== 3: 556 | return "%d.%d.%d" % (int(ch),int(s),int(ss)) 557 | return "%d.%d" % (int(ch),int(s)) 558 | 559 | def format_atc_code(self, code, size=4): 560 | return code[:size] 561 | 562 | def isMeasure(self, word): 563 | """ 564 | Do we have a stand alone measure symbol 565 | """ 566 | return ((re.match('\d+(mg$|ml$|iu$|mcg$|uml$|u1ml$|mg4ml$|micrograms$|million$|cm$|mm$|unit$|units$|hb$)', word)) != None) 567 | 568 | def isAllDigits(self, word): 569 | """ 570 | Does the word consist of only digits? 571 | """ 572 | return ((re.match('^\d+$', word)) != None) 573 | 574 | def isShortWord(self, word): 575 | """ 576 | Check if the word is longer than 3 chars 577 | """ 578 | return len(word) < 4 and word not in self.valid_short_words 579 | 580 | def isSingleLetter(self, word): 581 | """ 582 | Check if the word consists of a single letter? 583 | """ 584 | return (re.match('^\w$', word)) != None 585 | -------------------------------------------------------------------------------- /data/atc_unmatched_list.csv: -------------------------------------------------------------------------------- 1 | UKBB_code,UKBB_description 2 | 1140909674,cod liver oil capsule 3 | 1140923346,co-codamol 4 | 1140916682,evening primrose oil 5 | 1140876592,multivitamin+mineral preparations 6 | 1140911732,garlic product 7 | 1140883066,insulin product 8 | 1140888538,zinc product 9 | 1140923350,co-dydramol 10 | 1140870788,calcium salts 11 | 1140865354,gaviscon liquid 12 | 1140852948,calcium+vitamin d 500units tablet 13 | 1189,co-enzyme q10/ubiquinone/bio-quinone/coenzyme q10 14 | 1199,food supplement/plant/herbal extract 15 | 1140882694,betnovate cream 16 | 1140925800,movicol oral powder 17 | 1140851812,gtn 400micrograms spray 18 | 1140865010,viscotears liquid eye gel 19 | 1141145812,minerals - magnesium 20 | 1197,evening primrose oil product 21 | 1141180036,fybogel orange s/f granules 22 | 1141168326,kliovance 1mg/0.5mg tablet 23 | 1140922804,premique 0.625mg/5mg tablet 24 | 1201,st john's wort/hypericum [ctsu] 25 | 1140878226,diprobase cream 26 | 1140911730,flax oil tablet 27 | 1140922562,femoston 1/10 tablet 28 | 1140857636,prempak 0.625 tablet 29 | 1140911680,starflower oil 30 | 1140923336,co-tenidone 31 | 1140869180,microgynon 30 tablet 32 | 1140911736,ginseng product 33 | 1205,saw palmetto product 34 | 1140871168,voltarol 25mg e/c tablet 35 | 1140865396,buscopan 10mg tablet 36 | 1203,aloe vera product 37 | 1140923402,co-amilofruse 38 | 1140876404,aqueous cream bp 39 | 1141168752,peptac liquid 40 | 1140865416,colpermin 0.2ml m/r gel e/c capsule 41 | 1141176732,carbomers 42 | 1141172918,celluvisc 1% single-use eye drops 43 | 1140923348,co-proxamol 44 | 1140878304,e45 cream 45 | 1140910640,luteine 46 | 1140917056,kliofem tablet 47 | 1140871688,solpadol caplet 48 | 1140856342,syndol tablet 49 | 1140872112,epanutin 25mg capsule 50 | 1140867504,priadel 200mg m/r tablet 51 | 1140881882,timoptol 0.25% eye drops 52 | 1140923276,co-amilozide 53 | 1140911638,kelp+garlic product 54 | 1141168122,solpadol capsule 55 | 1140864196,climagest 1mg tablet 56 | 1140911640,lecithin product 57 | 1141184726,xalacom 0.005%/0.5% eye drops 58 | 1140882776,fucibet cream 59 | 1140865414,peppermint oil product 60 | 1140883968,carmellose 61 | 1140858452,hepacon b12 1mg/1ml injection 62 | 1140871680,tylex capsule 63 | 1140881474,normacol granules 64 | 1140882618,diprosalic ointment 65 | 1140868458,hormonin tablet 66 | 1140869324,loestrin 20 tablet 67 | 1140878324,oilatum cream 68 | 1140868518,nuvelle tablet 69 | 1140878186,liquifilm tears 1.4% eye 
drops 70 | 1141178052,zapain caplet 71 | 1140869176,logynon tablet 72 | 1140926430,climesse tablet 73 | 1140869346,cilest tablet 74 | 1140875632,movelat gel 75 | 1140922806,premique cycle 10mg tablet 76 | 1140872036,paramax tablet 77 | 1140870488,forceval capsule 78 | 1140910698,oil of peppermint 79 | 1141172686,coaprovel 150mg/12.5mg tablet 80 | 1140927320,dermol 500 lotion 81 | 1140869164,mercilon tablet 82 | 1140882626,betnesol 0.1% eye/ear/nose drops 83 | 1140872338,madopar 62.5 capsule 84 | 1141172436,indivina 1mg/2.5mg tablet 85 | 1140862526,sodium cromoglycate 86 | 1140912212,menophase tablet 87 | 1141167206,oestrogel 0.06% gel 88 | 1141167848,asasantin retard m/r capsule 89 | 1140878184,sno-tears eye drops 90 | 1141168650,solpadeine capsule 91 | 1140864070,kapake tablet 92 | 1141188210,berocca effervescent tablet 93 | 1140882112,co-careldopa 94 | 1140869162,marvelon tablet 95 | 1141187304,codipar caplet 96 | 1141188836,felendil xl 5mg m/r tablet 97 | 1140911636,kalms tablet 98 | 1140878498,polytar liquid 99 | 1140888578,antihypertensive 100 | 1141185986,cetraben emollient cream 101 | 1140876384,glandosane plain spray 102 | 1140921088,tridestra tablet 103 | 1141202030,estradot 25micrograms patch 104 | 1141173872,cetraben cream 105 | 1140864618,zestoretic 10 tablet 106 | 1141168648,solpadeine tablet 107 | 1140917450,oestrogel 1.25g gel 108 | 1140875630,movelat cream 109 | 1140882374,co-amoxiclav 110 | 1140868538,sustanon 100 oily injection 111 | 1140865762,regulan 3.6g/sachet powder 112 | 1141195836,dermol cream 113 | 1140888432,potassium product 114 | 1140878248,oilatum emollient bath additive 115 | 1140873780,co-trimoxazole 116 | 1140881414,gastrocote liquid 117 | 1141179824,yasmin tablet 118 | 1140868520,estracombi tts patch 119 | 1140878190,minims artificial tears single-use eye drops 120 | 1140876006,polyvinyl alcohol 1% eye drops 121 | 1140869186,ovranette tablet 122 | 1141189134,stalevo 50mg / 12.5mg / 200mg tablet 123 | 1141165512,kapake capsule 124 | 1140878286,diprobase ointment 125 | 1140868514,trisequens tablet 126 | 1141200400,amlostin 5mg tablet 127 | 1140871996,sanomigran 500micrograms tablet 128 | 1140865686,dulco-lax 5mg e/c tablet 129 | 1140871682,solpadol effervescent tablet 130 | 1141167140,exorex lotion 131 | 1140882542,fucidin cream 132 | 1140928880,geltears gel 133 | 1140876312,emulsifying ointment bp 134 | 1140870800,sandocal 400 effervescent tablet 135 | 1140879482,antacid tablet 136 | 1140878308,aveeno cream 137 | 1140878236,calmurid cream 138 | 1141178054,zapain capsule 139 | 1140870284,prostap sr 3.75mg injection (pdr for recon)+diluent+kit 140 | 1140865418,mintec 0.2ml e/c capsule 141 | 1140865658,lomotil tablet 142 | 1140880234,dianette tablet 143 | 1140869334,femodene tablet 144 | 1140865170,dried yeast 300mg tablet 145 | 1141163146,beclo-aqua 50 nasal spray 146 | 1140910566,glyclizide 147 | 1140875596,algesal cream 148 | 1140916342,beta-blocker 149 | 1140861884,maxepa 1g capsule 150 | 1140883162,combined oral contraceptive product 151 | 1141172628,almogran 12.5mg tablet 152 | 1141168374,dermol 200 shower emollient 153 | 1140867152,depixol 3mg tablet 154 | 1140882464,daktarin 2% cream 155 | 1140882110,co-beneldopa 156 | 1141186750,biotene oralbalance oral gel 157 | 1140866402,dyazide tablet 158 | 1140917128,imedeen tablet 159 | 1141187790,micardisplus 40mg/12.5mg tablet 160 | 1140876424,cocois ointment 161 | 1141200878,oilatum bath formula liquid bath additive 162 | 1140925778,paracodol capsule 163 | 1140881334,co-phenotrope 164 | 1140865840,predfoam 
20mg enema 165 | 1140871684,remedeine tablet 166 | 1140876394,salivix pastille 167 | 1140860838,gtn 300micrograms sublingual tablet 168 | 1140874794,betnesol 500mcg soluble tablet 169 | 1140883014,ortho-dienoestrol 0.01% cream 170 | 1140877600,halibut-liver oil capsule 171 | 1140870486,folicin tablet 172 | 1140866420,moduretic tablet 173 | 1141164652,unguentum m cream 174 | 1140856442,solpadeine soluble effervescent tablet 175 | 1141166368,femodette tablet 176 | 1140869266,trinovum tablet 177 | 1140868794,sodium clodronate 178 | 1141185480,ispagel 3.5g/sachet s/f powder 179 | 1140876026,simple eye ointment 180 | 1140869256,brevinor tablet 181 | 1141189064,solpadeine plus soluble effervescent tablet 182 | 1140865366,maalox plus suspension 183 | 1140857200,calcium sulphaloxate 184 | 1140865380,kolanticon gel 185 | 1141173444,dulco-lax 2.5mg perles 186 | 1140910730,cromoglycate 187 | 1140864346,cocois scalp ointment 188 | 1140860324,tenoret 50 tablet 189 | 1140863552,paramol tablet 190 | 1140888644,emollient product 191 | 1140882778,lotriderm cream 192 | 1141181882,betaferon 300micrograms injection (pdr for recon)+diluent 193 | 1140865808,lactugal solution 194 | 1141193170,eccoxolac 300mg capsule 195 | 1140878052,fml eye drops 196 | 1140871174,voltarol 100mg suppository 197 | 1140867952,fluanxol 500micrograms tablet 198 | 1140882976,growth hormone product 199 | 1140877890,transvasin cream 200 | 1140871686,remedeine forte tablet 201 | 1140871162,vitamins capsule bpc 202 | 1140868804,bonefos 400mg capsule 203 | 1140866352,navispare tablet 204 | 1140881622,co-danthramer 205 | 1140881318,co-magaldrox 206 | 1140923272,co-triamterzide 207 | 1140888628,hydrocortistab 1% cream 208 | 1140861922,lipid lowering drug 209 | 1140926686,femapak 40 patch+tablet 210 | 1140881324,magnesium trisilicate 211 | 1140871920,dhc continus 60mg m/r tablet 212 | 1140860784,innozide tablet 213 | 1140860358,tenif capsule 214 | 1141189008,solpadeine plus capsule 215 | 1140880726,hirudoid cream 216 | 1140878170,hypotonic artificial tears eye drops 217 | 1140874950,prednesol 5mg tablet 218 | 1140873930,septrin 480mg tablet 219 | 1141179944,galpharm hayfever and allergy relief 10mg tablet 220 | 1141167748,solpadeine max tablet 221 | 1141167678,pharmaton capsule 222 | 1140881422,asilone liquid 223 | 1140872798,augmentin 375mg tablet 224 | 1140871004,vitamins b+c 225 | 1140860328,tenoretic tablet 226 | 1141180766,novofem tablet 227 | 1140888630,hydrocortisyl 1% cream 228 | 1140880056,sulphur product 229 | 1140861416,paroven 250mg capsule 230 | 1141189010,solpadeine plus tablet 231 | 1140888456,dioralyte product 232 | 1140869190,trinordiol tablet 233 | 1140856348,veganin tablet 234 | 1141188766,duac once daily gel 235 | 1140881714,capozide tablet 236 | 1140878182,hypotears eye drops 237 | 1140876350,cod liver oil+zinc oxide 11.4/38% ointment 238 | 1140872734,augmentin 625mg tablet 239 | 1140861418,oxerutins 250mg capsule 240 | 1140910734,disodium cromoglycate 241 | 1140878288,unguentum merck cream 242 | 1140878280,ultrabase cream 243 | 1141145830,fybozest orange 3.5g s/f granules 244 | 1140911760,beechams powder 245 | 1140870900,phosphate-sandoz tablet 246 | 1140856436,propain tablet 247 | 1141190656,kapake 30/500 effervescent tablet 248 | 1140925930,movelat relief gel 249 | 1140916948,e45 lotion 250 | 1140878350,carbo-dome cream 251 | 1140869262,ovysmen tablet 252 | 1140869260,norimin tablet 253 | 1140869254,binovum tablet 254 | 1140865358,maalox tablet 255 | 1140856336,codis dispersible tablet 256 | 1141200882,oilatum bath 
formula liquid bath additive 300ml 257 | 1141193140,evening primrose oil 20% cream 258 | 1141185108,aveeno lotion 259 | 1141172966,propain caplet 260 | 1140882544,fucidin ointment 261 | 1140878222,artificial saliva 262 | 1140870786,ketovite tablet 263 | 1140868260,distalgesic tablet 264 | 1140867342,clopixol 2mg tablet 265 | 1140866318,spirolone 25mg tablet 266 | 1140865724,sodium picosulphate 267 | 1141172758,as saliva orthana spray 268 | 1140926626,saliveze spray 269 | 1140925936,movelat relief cream 270 | 1140917114,alka-seltzer tablet 271 | 1140876498,soya oil 84.75% bath oil 272 | 1140870840,solvazinc 200mg effervescent tablet 273 | 1140865368,mucogel suspension 274 | 1140860398,kalten capsule 275 | 1191,indigestion remedy (over the counter) 276 | 1141194228,numark hayfever and allergy relief 10mg tablet 277 | 1140881624,co-danthrusate 278 | 1140870328,pregaday tablet 279 | 1140866416,moduret 25 tablet 280 | 1141191198,optrex eye drops 281 | 1141186902,bioxtra oral gel 282 | 1141175756,migramax sachet powder 283 | 1141162940,e45 emollient bath oil 284 | 1140910728,clodronate disodium 285 | 1140878426,polytar emollient bath additive 286 | 1140875628,axsain cream 287 | 1140870102,interferons 288 | 1140856454,df118 30mg tablet 289 | 1140852996,metatone tonic 290 | 1141200092,eflornitine 11.5% cream 291 | 1141188530,crampex tablet 292 | 1141172224,acidex oral suspension 293 | 1141162982,muse 125micrograms pellet 294 | 1140884298,ethambutolol 295 | 1140868258,aspav dispersible tablet 296 | 1141146084,oralbalance oral gel 297 | 1140911574,saline 0.9% nose drops 298 | 1140910614,prindolol 299 | 1140878258,diprobath bath additive 300 | 1140873818,colomycin 1million units injection (pdr for recon) 301 | 1141165476,triapin mite 2.5mg/2.5mg tablet 302 | 1140917062,natrasleep tablet 303 | 1140910552,folate product 304 | 1140909938,coc - combined oral contraceptives 305 | 1140881894,diuretic 306 | 1140876164,dexa-rhinaspray nasal spray 307 | 1141195474,zerobase cream 308 | 1141191774,optrex allergy eye drops 309 | 1140928346,gastrocote s/f liquid 310 | 1140923344,co-codaprin 311 | 1140910794,picosulphate 312 | 1140881418,premiums tablet 313 | 1140871926,dhc continus 120mg m/r tablet 314 | 1140870796,calcium-sandoz syrup 315 | 1140865370,topal tablet 316 | 1140856340,solpadeine forte dispersible tablet 317 | 1141195844,dermol cream 500g 318 | 1141175766,co-cyprindiol 319 | 1141156858,domperamol tablet 320 | 1141146044,laxoberal 5mg/5ml liquid 321 | 1140921988,tylex effervescent soluble tablet 322 | 1140910572,nitroglycerol 323 | 1140882422,co-fluampicil 324 | 1140876496,soya oil+mixed lauromacrogols 82.95/15% bath oil 325 | 1140873934,septrin forte 960mg tablet 326 | 1140862960,visclair 100mg tablet 327 | 1140860736,accuretic tablet 328 | 1141192638,sebco ointment 329 | 1141189626,liquivisc 0.25% eye gel 330 | 1141188502,pollenase 50micrograms nasal spray 331 | 1141179954,care hayfever relief 50micrograms nasal spray 332 | 1141165754,librofem 200mg tablet 333 | 1140926516,optrex dry eye therapy eye drops 334 | 1140923282,co-flumactone 335 | 1140911560,nambumetone 336 | 1140882468,daktarin 25mg/ml oral gel 337 | 1140878488,t/gel shampoo 338 | 1140878194,lubrifilm eye ointment 339 | 1140877866,ibrufhalal 200mg tablet 340 | 1140876314,hydrous ointment bp 341 | 1140876008,hydroxyethylcellulose 342 | 1140871830,diconal tablet 343 | 1140866396,aldactide 25 tablet 344 | 1140863440,meprate 400mg tablet 345 | 1141189680,ultramol capsule 346 | 1141189572,nasivin 0.05% nasal spray 347 | 1141188900,sulazine ec 
500mg e/c tablet
348 | 1141157388,covonia bronchial balsam syrup
349 | 1140911756,askit powder
350 | 1140911678,waterfall tablet
351 | 1140880314,daktarin 0.16% powder spray
352 | 1140876608,feroglobin b12 syrup
353 | 1140876118,cerumol ear drops
354 | 1140868800,loron 400mg capsule
355 | 1140866412,lasilactone capsule
356 | 1140852908,multivite pellet
357 | 1140851150,laxoberal 5mg/5ml elixir
358 | 1207,unknown supplement
359 | 1141189598,adult meltus dry coughs with congestion oral liquid
360 | 1141184672,e45 itch relief cream
361 | 1141166848,dexa-rhinaspray duo aqueous nasal spray
362 | 1141163656,oculotect 5% single-use eye drops
363 | 1140925942,caprin 75mg e/c tablet
364 | 1140910754,polyacrylic acid
365 | 1140909910,aveeno bath oil
366 | 1140882768,vista-methasone n eye/ear/nose drops
367 | 1140879466,antacid liquid
368 | 1140878624,collodion product
369 | 1140878254,bath e45 bath oil
370 | 1140878224,lipobase cream
371 | 1140878216,orabase oral paste
372 | 1140876388,saliva orthana spray
373 | 1140874140,ciproxin 250mg tablet
374 | 1140869340,triadene tablet
375 | 1140868082,valoid 50mg tablet
376 | 1140866408,frusene tablet
377 | 1140865228,topical anti-inflammatory prep[1]
378 | 1140862912,simple linctus
379 | 1140857716,estrovis 4mg tablet
380 | 1140852970,vita-e 75iu tablet
381 | 1141195842,dermol cream 100g
382 | 1141181186,co-zidocapt 25mg/12.5mg tablet
383 | 1141173798,aerodiol 150micrograms nasal spray
384 | 1141172704,trizivir tablet
385 | 1141172298,sst tablet
386 | 1141157480,co-magaldrox product
387 | 1141151346,ciproxin 100mg tablet
388 | 1140911816,strepsils lozenge
389 | 1140910558,fusidate sodium
390 | 1140910416,ung emuls - ungentum emulsificans
391 | 1140882624,vista-methasone 0.1% eye/ear/nose drops
392 | 1140882018,aller-eze tablet
393 | 1140880336,colomycin topical powder
394 | 1140880324,nystaform cream
395 | 1140874084,rifinah 300 tablet
396 | 1140870596,slow sodium 600mg m/r tablet
397 | 1140869414,cystopurin 3g/sachet granules
398 | 1140868972,danol 100mg capsule
399 | 1140868286,paracodol soluble tablet
400 | 1140865552,liquorice
401 | 1140861410,opilon 40mg tablet
402 | 1140854254,dermacare cream 100ml
403 | 1140852900,juvel tablet
404 | 1140850748,mucogel tablet
405 | 1141200880,oilatum bath formula liquid bath additive 150ml
406 | 1141200726,lisicostad hct 10/12.5mg tablet
407 | 1141193276,pepcidtwo chewable indigestion tablet
408 | 1141192718,galpharm non-drowsy allergy relief 10mg tablet
409 | 1141189606,eumobase cream
410 | 1141188790,peptobismol 1.752% suspension
411 | 1141181398,liposic eye gel
412 | 1141173536,gelclair 15ml/sachet oral gel
413 | 1141168824,heliclear triple pack
414 | 1141167990,oilatum fragrance free liquid bath additive
415 | 1140923850,betaferon 9.6 million iu injection (pdr for recon)+diluent
416 | 1140909904,tri-iodothyronine product
417 | 1140888666,nitrate vasodilator
418 | 1140883818,vaseline dermacare cream
419 | 1140882998,miscellaneous cystitis remedies
420 | 1140882964,oral hypoglycaemic
421 | 1140881342,sodium acid phosphate
422 | 1140879396,codalax liquid
423 | 1140876260,bonjela 8.7% oral gel
424 | 1140873932,septrin 480mg dispersible tablet
425 | 1140873830,fucidin 250mg tablet
426 | 1140872808,augmentin 125mg/31mg/5ml s/f suspension
427 | 1140869174,eugynon 30 tablet
428 | 1140865832,predenema 20mg/100ml standard tube retention enema
429 | 1140865758,isogel granules
430 | 1140865702,normax capsule
431 | 1140860330,tolerzide tablet
432 | 1140856428,paramol 10/500mg tablet
433 | 1140851272,anacal suppository
434 | 1140851218,anacal ointment
435 | 1141194948,vantage pharmacy sleep aid 50mg tablet
436 | 1141193808,colomycin 2million units injection (pdr for recon)
437 | 1141189678,ultramol tablet
438 | 1141188180,beechams all in one syrup
439 | 1141175200,medocodene 30/500 capsule
440 | 1141174552,tobradex eye drops
441 | 1141173574,night nurse capsule
442 | 1141168116,refresh 1.4% ophthalmic solution
443 | 1141157476,co-beneldopa product
444 | 1141157472,calcium polystyrene sulphonate product
445 | 1141152366,daktarin dual action 2% cream
446 | 1140927388,remedeine effervescent tablet
447 | 1140927384,remedeine forte effervescent tablet
448 | 1140927202,mxl 90mg m/r capsule
449 | 1140916984,emulsifying soap
450 | 1140910564,glybenclamide
451 | 1140910026,zinc+castor oil cream bp
452 | 1140909918,biosynthetic human growth hormone
453 | 1140909470,otex ear drops
454 | 1140880316,daktarin 2% dusting powder
455 | 1140879426,antiemetic
456 | 1140879390,co-simalcite
457 | 1140878306,calendolon ointment
458 | 1140878282,k/l dry skin cream
459 | 1140878026,ralgex spray
460 | 1140874146,ciproxin 500mg tablet
461 | 1140874082,rifinah 150 tablet
462 | 1140874080,rifater tablet
463 | 1140871228,fenopron 300mg tablet
464 | 1140869264,synphase tablet
465 | 1140868470,estrapak 50micrograms/1mg patch+tablet
466 | 1140867860,faverin 50mg tablet
467 | 1140865822,anacal rectal ointment
468 | 1140865294,gastron tablet
469 | 1140864536,df118 forte 40mg tablet
470 | 1140864408,fleet enema
471 | 1140863028,welldorm tablet
472 | 1140860334,trasidrex tablet
473 | 1140857990,minovlar tablet
474 | 1140855838,evacalm 2mg tablet
475 | 1140855520,duo-autohaler inhaler
476 | 1140855332,iso-autohaler 80micrograms inhaler
477 | 1140851692,capozide 50mg tablets x28
478 | 1141201322,dermablend leg and body natural cover cream
479 | 1141200458,denzapine 25mg tablet
480 | 1141193146,gammaderm cream
481 | 1141192286,aller-eze 0.05% eye drops
482 | 1141190006,propain plus caplet
483 | 1141184648,human luteinising hormone product
484 | 1141182650,day and night nurse capsule
485 | 1141173902,vivioptal capsule
486 | 1141173572,night nurse oral solution
487 | 1141145658,angiotensin ii receptor antagonist
488 | 1140927338,compound coconut ointment
489 | 1140927204,mxl 120mg m/r capsule
490 | 1140923682,tricalcium phosphate 3.3g/sachet powder
491 | 1140923648,methodex 1mg/1ml mixture
492 | 1140923404,co-prenozide
493 | 1140917452,metazem 60mg m/r tablet
494 | 1140916354,ailax suspension
495 | 1140911598,saline 0.9% topical solution
496 | 1140910776,cantassium vitamin b6 50mg tablet
497 | 1140910726,disodium clodronate
498 | 1140910674,ethinylnortestosterone
499 | 1140910634,deltahydrocortisone
500 | 1140909722,amipramizide
501 | 1140888766,nacl - sodium chloride
502 | 1140888460,electrolade product
503 | 1140884422,dipipanone
504 | 1140884140,daktarin powder
505 | 1140883810,locobase cream
506 | 1140882546,fucidin gel
507 | 1140882376,sulpitil 200mg tablet
508 | 1140882276,enterosan tablet
509 | 1140881416,mucaine suspension
510 | 1140880268,alcoholic coal tar extract 5% shampoo
511 | 1140880154,bromine complexes
512 | 1140880058,sulphur+salicylic acid cream bp
513 | 1140879506,effico tonic
514 | 1140879412,phosphate enema
515 | 1140878664,mycota powder
516 | 1140878610,mycota cream
517 | 1140878262,zeasorb dusting powder
518 | 1140878228,vita-e ointment
519 | 1140877696,abidec drops
520 | 1140876646,calcium polystyrene sulphonate
521 | 1140876318,hydrous wool fat ointment bp
522 | 1140872918,magnapen capsule
523 | 1140872816,augmentin 1.2g injection (pdr for recon)
524 | 1140872802,augmentin 375mg s/f dispersible tablet
525 | 1140872032,migravess effervescent tablet
526 | 1140871924,dhc continus 90mg m/r tablet
527 | 1140871028,becosym tablet
528 | 1140869282,noristerat 200mg/1ml oily injection
529 | 1140868090,dimenhydrinate
530 | 1140865894,dulco-lax 5mg paediatric suppository
531 | 1140864176,monozide 10 tablet
532 | 1140861568,minihep calcium 5000iu/0.2ml injection
533 | 1140860406,moducren tablet
534 | 1140860338,viskaldix tablet
535 | 1140856806,amoxidin 500mg capsule
536 | 1140856418,panadeine forte tablet
537 | 1140856416,panadeine tablet
538 | 1140856406,medocodeine tablet
539 | 1140856332,antoin dispersible tablet
540 | 1140856312,claradin 300mg tablet
541 | 1140855816,congesteze 120mg/1mg tablet
542 | 1140852894,dalivit capsule
543 | 1141200876,oilatum junior bath formula liquid bath additive 300ml
544 | 1141200872,oilatum junior bath formula liquid bath additive
545 | 1141200736,galpharm flu strength all in one s/f oral solution
546 | 1141200470,acnocin 2000/35 tablet
547 | 1141200110,galsud 30mg/5ml linctus
548 | 1141199942,tilolec 100mg/25mg m/r tablet
549 | 1141199916,galpharm heartburn relief 10mg e/c tablet
550 | 1141195034,salinum sugar free oral solution
551 | 1141194946,oilatum fragrance free junior liquid bath additive
552 | 1141193210,day nurse capsule
553 | 1141191194,witch hazel product
554 | 1141190152,dymotil tablet
555 | 1141189772,gonapeptyl depot 3.75mg inj (pdr for recon)+solv p/f syringe
556 | 1141188676,carglutamic acid
557 | 1141188504,pollenase allergy 2% eye drops
558 | 1141186802,viraferonpeg 150mcg pdr+solv for soln for inj prefilled pen
559 | 1141186800,viraferonpeg 120mcg pdr+solv for soln for inj prefilled pen
560 | 1141186794,viraferonpeg 80mcg pdr+solv for soln for inj prefilled pen
561 | 1141184174,clarithrom tab+lansopraz cap+metronidaz tab 500/30/400mg pck
562 | 1141176172,silgel cream
563 | 1141173956,oilatum junior cream
564 | 1141170516,daktarin dual action 0.16% powder spray
565 | 1141168848,rinstead contact pastille
566 | 1141164618,ketil 2.5% gel
567 | 1141163094,dayleve 0.1% cream
568 | 1141157438,ciproxin 5g/100ml oral suspension
569 | 1141152070,ampitrin 125mg/5ml oral suspension
570 | 1141150478,decubal cream
571 | 1141150430,dermacare cream 150ml
572 | 1140928624,frusemek 5mg/40mg tablet
573 | 1140928266,solpaflex tablet
574 | 1140928260,panadeine co tablet
575 | 1140927624,lassar's paste
576 | 1140926360,alphaparin 3000iu/0.3ml prefilled syringe
577 | 1140923752,meronem 1g infusion kit
578 | 1140922936,enlive
579 | 1140922344,dermamist spray
580 | 1140921652,levorphanol
581 | 1140913318,rhdnase
582 | 1140913038,colomycin topical powder 1g
583 | 1140910988,vita-e cream
584 | 1140910818,butamidum
585 | 1140910802,androstanazol
586 | 1140910780,cantassium vitamin e 200iu capsule
587 | 1140910664,benzoxazocine
588 | 1140910644,fenopraine
589 | 1140910642,diprazinum
590 | 1140910620,aldadiene potassium
591 | 1140910602,amidine
592 | 1140910428,sa - salicylic acid
593 | 1140910370,meclastine
594 | 1140909880,hydroxycholecalciferol
595 | 1140909734,fortespan spansule
596 | 1140909428,earex ear drops
597 | 1140888874,salivace sugar free spray
598 | 1140888462,gluco-lyte
599 | 1140883528,epifrin 1% eye drops
600 | 1140882220,paracets 500mg capsule
601 | 1140882146,uniflu plus gregovite c tablet
602 | 1140882106,femigraine tablet
603 | 1140881412,algicon suspension
604 | 1140880458,zinc undecenoate+undecenoic acid 20/5% cream
605 | 1140880166,podophyllum resin
606 | 1140880018,zinc+salicylic acid paste bp
607 | 1140879930,strong coal tar solution+pine tar 5/5% gel
608 | 1140879674,pipothiazine
609 | 1140878608,monphytol paint
610 | 1140878586,calmurid solution
611 | 1140878316,lacticare lotion
612 | 1140878312,kamillosan ointment
613 | 1140878300,alcoderm cream
614 | 1140878242,aveeno oilated bath additive
615 | 1140877744,steripod blue topical liquid
616 | 1140877706,minadex sugar-free oral drops
617 | 1140876422,soap substitute+zinc oxide 5% cream wash
618 | 1140876338,zinc+castor oil ointment bp
619 | 1140876336,zinc 15% ointment bp
620 | 1140876330,flexible collodion bp
621 | 1140876324,simple ointment bp
622 | 1140876316,hydrous wool fat bp
623 | 1140876214,tyrocane lozenge
624 | 1140875594,white liniment bp
625 | 1140874954,hydrocortistab 20mg tablet
626 | 1140874410,fansidar tablet
627 | 1140874116,metrozol 500mg/100ml infusion
628 | 1140874000,zinamide 500mg tablet
629 | 1140873936,septrin adult suspension
630 | 1140873834,fucidin 500mg i-v infusion+buffer
631 | 1140873812,colomycin 1.5million units tablet
632 | 1140873798,bactrim 480mg tablet
633 | 1140873696,erythromid ds 500mg e/c tablet
634 | 1140873694,erythromid 250mg e/c tablet
635 | 1140872976,rimoxallin 500mg capsule
636 | 1140872826,augmentin 250mg/62mg/5ml s/f suspension
637 | 1140871066,gentian alkaline mixture
638 | 1140871034,vigranon b syrup
639 | 1140870492,octovit tablet
640 | 1140870480,fefol-vit spansule
641 | 1140870310,ferfolic sv tablet
642 | 1140870308,fefol spansule
643 | 1140870104,introna-2b 3million iu injection (pdr for recon)+diluent
644 | 1140869338,tri-minulet tablet
645 | 1140869272,neogest tablet
646 | 1140869258,neocon 1/35 tablet
647 | 1140869188,schering pc4 tablet
648 | 1140869184,ovran 30 tablet
649 | 1140869032,dienoestrol
650 | 1140868512,syntex menophase tablet
651 | 1140868280,cosalgesic tablet
652 | 1140868076,cinaziere 15mg tablet
653 | 1140867988,dramamine 50mg tablet
654 | 1140867734,concordin 5mg tablet
655 | 1140866692,beta-adrenoceptor blocking drug
656 | 1140866442,diumide-k continus m/r tablet
657 | 1140866418,fru-co tablet
658 | 1140866410,kalspare tablet
659 | 1140866404,dytide capsule
660 | 1140866400,amil-co tablet
661 | 1140866328,triam-co tablet
662 | 1140866008,rowachol capsule
663 | 1140865760,metamucil powder
664 | 1140865548,pyrogastrone tablet
665 | 1140865478,tripotassium dicitratobismuthate 120mg tablet
666 | 1140864808,tropergen tablet
667 | 1140864562,introna 25million iu/5ml injection solution
668 | 1140864502,testotop tts 15mg transdermal patch
669 | 1140863034,chlormethiazole
670 | 1140862124,exirel 200micrograms inhaler
671 | 1140861776,antiplatelet drug
672 | 1140861444,saventrine 30mg tablet
673 | 1140860410,prestim tablet
674 | 1140860348,atenixco 50mg/12.5mg tablet
675 | 1140859776,pernomol paint
676 | 1140859282,nitrophenol
677 | 1140858378,ironorm capsule
678 | 1140858324,medroxyprogest 80mg/ml suspension 100ml
679 | 1140858310,gastrovite tablet
680 | 1140858306,fesovit-z m/r capsule
681 | 1140857920,minilyn tablet
682 | 1140857628,gestone 10mg/1ml injection
683 | 1140857198,septrin 960mg/3ml intramuscular injection
684 | 1140856754,ciclacillin
685 | 1140856456,df118 10mg/5ml elixir
686 | 1140856422,paradeine tablet
687 | 1140856410,neurodyne capsule
688 | 1140856214,solprin 300mg dispersible tablet
689 | 1140856114,durophet 7.5mg m/r capsule
690 | 1140856040,methyprylone
691 | 1140855890,dormonoct 1mg tablet
692 | 1140855870,almazine 1mg tablet
693 | 1140855426,biophylline 350mg m/r tablet
694 | 1140854432,cortacream 1% band
695 | 1140854256,dermacare lotion 75ml
696 | 1140854112,merocaine lozenge
697 | 1140854000,hayphryn nasal spray
698 | 1140853986,neophryn 0.5% nasal spray
699 | 1140853676,opulets sodium chloride single-use eye drops 0.5ml
700 | 1140853440,lachesine chloride
701 | 1140852904,minamino compound syrup
702 | 1140852884,calcimax syrup
703 | 1140852876,tonivitan syrup
704 | 1140852872,lipotriad capsule
705 | 1140851360,brinaldix k tablet
706 | 1140851306,dehydrocholic acid
707 | 1140851278,betnovate compound suppository
708 | 1140851128,hamamelis 200mg suppository
709 | 1140851066,trifyba 250g powder
710 | 1140851064,lejfibre 10g biscuits
711 | 1140851062,fybranta 2g tablet
712 | 1140850932,bellocarb tablet
713 | 1140850720,gastrils 500mg green (mint) pastille
714 | 1140850714,droxalin tablet
--------------------------------------------------------------------------------