├── images └── ukbb_srmed.png ├── sh ├── chembl │ └── get_chembl_atc_classification_data_sqlite.sh ├── 04_make_bnf_phewas_phenotypes.sh ├── pheno │ ├── medication_pheno_transpose.sh │ ├── medication_pheno_extract.sh │ ├── normalise_bnf_codes.sh │ ├── cut_pheno_cols.sh │ ├── transpose_pheno_data.sh │ ├── make_ukb_phenotypes_bnf.sh │ ├── make_ukb_phenotypes_bnf_prep.sh │ ├── make_ukb_phenotypes_atc_level3.sh │ ├── make_ukb_phenotypes_atc_level2.sh │ └── make_ukb_phenotypes_atc_prep.sh ├── 04_make_atc_phewas_phenotypes.sh ├── merge_chembl_synonyms_bnf.sh ├── merge_chembl_synonyms_atc.sh ├── merge_chembl_synonyms_ukbb.sh ├── atc_words_synonyms.sh ├── 03_get_ukbb_srmed_data_atc.sh ├── 03_get_ukbb_srmed_data_bnf.sh ├── get_molecule_synonyms_sqlite.sh ├── 01_bnf_prepare_sqlite.sh ├── bnf_words_synonyms.sh ├── preprocess_atc_data.sh ├── preprocess_bnf_data.sh ├── 01_atc_prepare_sqlite.sh ├── 02_bnf_match.sh ├── 02_atc_match.sh ├── bnf_post_process_match_data.sh └── atc_post_process_match_data.sh ├── py ├── bnf_parse.py ├── atc_parse.py ├── pheno │ ├── transpose_pheno_data.py │ ├── normalise_bnf_codes.py │ ├── generate_bnf_medication_annotations.py │ ├── assign_codes_to_participant_data.py │ ├── generate_medication_phenotypes.py │ └── cut_main_csv_file.py ├── list_excl_words.py ├── chembl │ ├── get_atc_data_with_molregno_sqlite.py │ ├── dump_sqlite_table_data.py │ ├── generate_atc_medication_annotations_sqlite.py │ └── generate_atc_medication_annotations.py ├── generate_syn_dictionary.py ├── parse_chembl_synonyms.py ├── append_ukb_counts.py ├── merge_chembl_synonyms.py ├── code_data_match.py └── datahelper.py ├── env └── common_tplt ├── CODE_OF_CONDUCT.md ├── README.md ├── LICENSE └── data └── atc_unmatched_list.csv
/images/ukbb_srmed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PhilAppleby/ukbb-srmed/HEAD/images/ukbb_srmed.png
--------------------------------------------------------------------------------
/sh/chembl/get_chembl_atc_classification_data_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Join atc_classification and molecule_atc_classification 4 | # 5 | python ${PYDIR}/chembl/get_atc_data_with_molregno_sqlite.py > \ 6 | ${CDATADIR}/atc_classification_molregno.tsv
--------------------------------------------------------------------------------
/sh/04_make_bnf_phewas_phenotypes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Generate UKB medication data phenotypes 5 | # 6 | echo "Step 11 - Generate BNF phenotypes" 7 | time ${SHDIR}/pheno/make_ukb_phenotypes_bnf.sh 8 | echo "Step 12 - Generate BNF phenotypes, version 2" 9 | time ${SHDIR}/pheno/normalise_bnf_codes.sh
--------------------------------------------------------------------------------
/sh/pheno/medication_pheno_transpose.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Wrap the pheno/transpose_pheno_data.sh script, supplying parameters 5 | # for medication data 6 | # 7 | echo "Step 9 - Transpose medication phenotype data, eliminating empty cells" 8 | time ${SHDIR}/pheno/transpose_pheno_data.sh reported_medication 20003
--------------------------------------------------------------------------------
/sh/04_make_atc_phewas_phenotypes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Generate UKB medication data phenotypes 5 | # 6 | echo "Step 11a - Generate level2 phenotypes" 7 | time ${SHDIR}/pheno/make_ukb_phenotypes_atc_level2.sh 8 | echo "Step 11b - Generate level3 phenotypes" 9 | time ${SHDIR}/pheno/make_ukb_phenotypes_atc_level3.sh
--------------------------------------------------------------------------------
/sh/merge_chembl_synonyms_bnf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Merge bulk BNF data (ownership uncertain, possibly HIC) with 4 | # synonyms from the CHEMBL database 5 | # 6 | cat ${BNFDATADIR}/bnf_combined.csv | \ 7 | python ${PYDIR}/merge_chembl_synonyms.py --synfile=${CDATADIR}/syn_dict_all.txt > \ 8 | ${BNFDATADIR}/bnf_combined_synonyms.csv
--------------------------------------------------------------------------------
/sh/pheno/medication_pheno_extract.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Wrap the pheno/cut_pheno_cols.sh script, supplying parameters 5 | # for medication data - US only as that's where the main 6 | # UKB csv file resides 7 | # 8 | echo "Step 8 - Extract medication phenotype from the main UKBB file" 9 | time ${SHDIR}/pheno/cut_pheno_cols.sh reported_medication 20003
--------------------------------------------------------------------------------
/sh/merge_chembl_synonyms_atc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Merge (attach) chembl synonyms with atc classification data - both originally 5 | # extracted from the CHEMBL database 6 | # 7 | # 8 | cat ${ATCDATADIR}/atc_who_desc.csv | \ 9 | python ${PYDIR}/merge_chembl_synonyms.py --synfile=${CDATADIR}/syn_dict_all.txt > \ 10 | ${ATCDATADIR}/atc_who_desc_synonyms.csv
--------------------------------------------------------------------------------
/sh/pheno/normalise_bnf_codes.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | echo '.1 cat BNF coded data for conversion' ${UKBPDIR} 4 | cat ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv \ 5 | | python ${PYDIR}/pheno/normalise_bnf_codes.py \ 6 | > ${UKBPDIR}/bnf_chapter_section_subsection_counts.csv 7 | #------------------------------------------------------------------------------------------------ 8 | echo 'END'
--------------------------------------------------------------------------------
/sh/merge_chembl_synonyms_ukbb.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Merge chembl synonyms with UKBB medication coding data: chembl synonyms 5 | # from the CHEMBL database, UKBB coding data from the UK Biobank data showcase 6 | # 7 | 8 | cat ${UDATADIR}/medication_coding.csv | \ 9 | python ${PYDIR}/merge_chembl_synonyms.py --synfile=${CDATADIR}/syn_dict_all.txt > \ 10 | ${UDATADIR}/medication_coding_synonyms.csv
--------------------------------------------------------------------------------
/sh/atc_words_synonyms.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # CHEMBL synonyms have been attached to both ATC classification data 5 | # and UKB coding data 6 | # This is the main match process - the result file contains code matches 7 | # 8 | # 9 | cat ${UDATADIR}/medication_coding_synonyms.csv | \ 10 | python ${PYDIR}/code_data_match.py --clsfile=${ATCDATADIR}/atc_who_desc_synonyms.csv > \ 11 | ${ATCDATADIR}/results/atc_res.csv
--------------------------------------------------------------------------------
/sh/03_get_ukbb_srmed_data_atc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Extract UKBB SR medication data 4 | # 5 | echo "Step 8 - Extract phenotype information from the UKBB main phenotype file" 6 | time ${SHDIR}/pheno/medication_pheno_extract.sh 7 | echo "Step 9 - Transpose extracted phenotype data" 8 | time ${SHDIR}/pheno/medication_pheno_transpose.sh 9 | echo "Step 10 - Prepare ATC data for phenotype generation" 10 | time ${SHDIR}/pheno/make_ukb_phenotypes_atc_prep.sh
--------------------------------------------------------------------------------
/sh/03_get_ukbb_srmed_data_bnf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Extract UKBB SR medication data 4 | # 5 | echo "Step 8 - Extract phenotype information from the UKBB main phenotype file" 6 | time ${SHDIR}/pheno/medication_pheno_extract.sh 7 | echo "Step 9 - Transpose extracted phenotype data" 8 | time ${SHDIR}/pheno/medication_pheno_transpose.sh 9 | echo "Step 10 - Prepare BNF data for phenotype generation" 10 | time ${SHDIR}/pheno/make_ukb_phenotypes_bnf_prep.sh
--------------------------------------------------------------------------------
/sh/get_molecule_synonyms_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Extract synonyms from the chembl molecule_synonyms table and "flatten" them 4 | # into one record prior to synonym dictionary generation, which expands 5 | # to one record per synonym in the data file output by the pipeline 6 | # 7 | python ${PYDIR}/chembl/dump_sqlite_table_data.py --tablename=molecule_synonyms | \ 8 | sort -k1,1 -n | \ 9 | python ${PYDIR}/parse_chembl_synonyms.py | \ 10 | python ${PYDIR}/generate_syn_dictionary.py > ${CDATADIR}/syn_dict_all.txt
--------------------------------------------------------------------------------
/sh/pheno/cut_pheno_cols.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | #source ${UKBPROJROOT}/env/common 3 | # Only runnable against the single read-only copy of the UKB Phenotype data 4 | # 5 | # must supply arg1, arg2 only (descriptive name and column prefix) 6 | if [[ $# -ne 2 ]] ; then 7 | echo 'must supply arg1, arg2 only (descriptive name and column prefix)' 8 | exit 1 9 | fi 10 | echo ${UKBBPHENODIR}/${UKBBPHENOFILE} 11 | echo "Cut cols: write to ${UKBPDIR} (ukb_${2}_${1}.csv)" 12 | python ${PYDIR}/pheno/cut_main_csv_file.py --csvfile=${UKBBPHENODIR}/${UKBBPHENOFILE} --colprefs=${2} > ${UKBPDIR}/ukb_${2}_${1}.csv
--------------------------------------------------------------------------------
/sh/pheno/transpose_pheno_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | #source ${UKBPROJROOT}/env/common 3 | # Take a multi-column file extracted for a phenotype from 4 | # the main UKB csv file and output one record per participant 5 | # per phenotype code 6 | # 7 | # must supply arg1, arg2 only (descriptive name and column prefix) 8 | if [[ $# -ne 2 ]] ; then 9 | echo 'must supply arg1, arg2 only (descriptive name and column prefix)' 10 | exit 1 11 | fi 12 | echo "Transp: write to ${UKBPDIR}" 13 | 14 | cat ${UKBPDIR}/ukb_${2}_${1}.csv | \ 15 | python ${PYDIR}/pheno/transpose_pheno_data.py > ${UKBPDIR}/ukb_${2}_${1}_n.csv
--------------------------------------------------------------------------------
/sh/01_bnf_prepare_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Data preparation - get BNF classification data and generate synonyms 5 | # 6 | # NOTE: No step 1 here, BNF files must be supplied - this depends 7 | # on local ownership of BNF data 8 | # 9 | echo "Step 2 - Preprocess BNF classification data" 10 | time ${SHDIR}/preprocess_bnf_data.sh 11 | echo "Step 3 - Get and reorganise CHEMBL molecule synonyms" 12 | time ${SHDIR}/get_molecule_synonyms_sqlite.sh 13 | 14 | # Main file artefacts at the end of each step of this group of steps: 15 | 16 | # 2) ${BNFDATADIR}/bnf_combined.csv 17 | # 18 | # 3) ${CDATADIR}/syn_dict_all.txt
--------------------------------------------------------------------------------
/sh/bnf_words_synonyms.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Match words in medication_coding.tsv (the UKBB site has this as coding4.tsv - download link 4 | # was http://biobank.ctsu.ox.ac.uk/showcase/coding.cgi?id=4 as at 20171019) 5 | # 6 | # bnf_combined.csv is a mashup of words from both the chemical names file and from HIC's bnf file 7 | # CAN WE USE THIS? 8 | # Advantages are that it has proprietary names and chemical names 9 | # 10 | cat ${UDATADIR}/medication_coding_synonyms.csv | python ${PYDIR}/code_data_match.py \ 11 | --clsfile=${BNFDATADIR}/bnf_combined_synonyms.csv \ 12 | --multioutput=N \ 13 | > ${BNFDATADIR}/results/bnf_res.csv
--------------------------------------------------------------------------------
/sh/preprocess_atc_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Preprocess the ATC classification data to cut the relevant columns 5 | # Executed twice to capture both molregno and description 6 | # Then to trim the ATC code to the required length (level3) 7 | # 8 | awk -F '\t' '{print $2 "\t" $3}' ${CDATADIR}/atc_classification_molregno.tsv | \ 9 | grep -v None > ${CDATADIR}/atc_who_desc_fullcode.tsv 10 | awk -F '\t' '{print $2 "\t" $1}' ${CDATADIR}/atc_classification_molregno.tsv >> \ 11 | ${CDATADIR}/atc_who_desc_fullcode.tsv 12 | cat ${CDATADIR}/atc_who_desc_fullcode.tsv | \ 13 | python ${PYDIR}/atc_parse.py --codelen=4 > ${ATCDATADIR}/atc_who_desc.csv
--------------------------------------------------------------------------------
/sh/preprocess_bnf_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Preprocess BNF files to cut the relevant columns, normalise 4 | # coding and convert all text to lower case 5 | # 6 | # Local data Code + long description 7 | cut -f 1,3 -d ',' ${BNFDATADIR}/bnf_data_formatted.csv > ${BNFDATADIR}/bnf_data_1.csv 8 | # Local data Code + approved_name 9 | cut -f 1,4 -d ',' ${BNFDATADIR}/bnf_data_formatted.csv | sed '1d' > ${BNFDATADIR}/bnf_data_2.csv 10 | 11 | # Cat the 2 result files from above, format the bnf code and convert all text to lower case 12 | cat ${BNFDATADIR}/bnf_data_1.csv ${BNFDATADIR}/bnf_data_2.csv | python ${PYDIR}/bnf_parse.py > ${BNFDATADIR}/bnf_combined.csv
--------------------------------------------------------------------------------
/py/bnf_parse.py:
--------------------------------------------------------------------------------
1 | # 2 | # Transform BNF code to formatted version 3 | # 4 | import time 5 | import datetime 6 | # import re 7 | import os, sys 8 | from datahelper import Datahelper 9 | 10 | def main(): 11 | count = 0 12 | dh = Datahelper() 13 | hdr = sys.stdin.readline().strip() 14 | print(hdr) 15 | 16 | for line in sys.stdin: 17 | count += 1 18 | data = line.strip().split(',') 19 | data[0] = dh.format_digit_code(data[0], 3) 20 | data[1] = data[1].lower() 21 | print(','.join(data)) 22 | 23 | return count 24 | 25 | # execution flow starts here 26 | # 27 | start_time = time.time() 28 | 29 | count = main() 30 | #print "END:", time.time() - start_time, "seconds", count 31 |
--------------------------------------------------------------------------------
/sh/01_atc_prepare_sqlite.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Data preparation - get ATC classification data and generate synonyms 5 | # 6 | echo "Step 1 - Get ATC classification data from the CHEMBL Db" 7 | time ${SHDIR}/chembl/get_chembl_atc_classification_data_sqlite.sh 8 | echo "Step 2 - Preprocess ATC classification data" 9 | time ${SHDIR}/preprocess_atc_data.sh 10 | echo "Step 3 - Get and reorganise CHEMBL molecule synonyms" 11 | time ${SHDIR}/get_molecule_synonyms_sqlite.sh 12 | 13 | # Main file artefacts at the end of each step of this group of steps: 14 | 15 | # 1) ${CDATADIR}/atc_classification_molregno.tsv 16 | # 17 | # 2) ${ATCDATADIR}/atc_who_desc.csv 18 | # 19 | # 3) ${CDATADIR}/syn_dict_all.txt
--------------------------------------------------------------------------------
/py/atc_parse.py:
--------------------------------------------------------------------------------
1 | # 2 | # Trim ATC code to formatted version 3 | # Calls a function in the datahelper class 4 | # 5 | import time 6 | import datetime 7 | import os, sys 8 | from datahelper import Datahelper 9 | from optparse import OptionParser 10 | 11 | def main(options): 12 | dh = Datahelper() 13 | 14 | for line in sys.stdin: 15 | data = line.strip().split('\t') 16 | data[0] = dh.format_atc_code(data[0], int(options.codelen)) 17 | data[1] = dh.get_normalised_phrase(data[1]).lower() 18 | print(','.join(data)) 19 | # End main() 20 | 21 | # execution flow starts here 22 | # 23 | start_time = time.time() 24 | 25 | parser = OptionParser() 26 | parser.add_option("-l", "--codelen", dest="codelen", 27 | help="ATC code length", metavar="INT") 28 | (options, args) = parser.parse_args() 29 | 30 | main(options) 31 |
--------------------------------------------------------------------------------
/py/pheno/transpose_pheno_data.py:
--------------------------------------------------------------------------------
1 | # 2 | # print records for all participants for all codes found in the input (eliminates blank records) 3 | # 4 | # NOTE: Assumes relevant data starts in the second column (1) - (eid) is the first column (0).
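# Example (hypothetical eid and codes): the input row "1000001,123456,,654321" yields
# two output rows, "1000001,123456" and "1000001,654321".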
5 | # 6 | import time 7 | import datetime 8 | import re 9 | import os, sys 10 | import random 11 | import json 12 | 13 | def main(): 14 | 15 | codes = [] 16 | count = 0 17 | hdr = sys.stdin.readline() 18 | print("eid,code") 19 | 20 | for line in sys.stdin: 21 | data = line.strip().split(',') 22 | for elem in data[1:]: 23 | if elem != "": 24 | print("{0},{1}".format(data[0], elem)) 25 | 26 | return count 27 | 28 | 29 | # execution flow starts here 30 | # 31 | start_time = time.time() 32 | 33 | count = main() 34 | #print "END:", time.time() - start_time, "seconds", count 35 | 36 |
--------------------------------------------------------------------------------
/env/common_tplt:
--------------------------------------------------------------------------------
1 | export UKBBDATADIR= 2 | export ATCDATADIR=${PROJDATA}/atc 3 | export BNFDATADIR=${PROJDATA}/bnf 4 | export BNFCODEDIR=${PROJDATA}/GoDARTS2/lookups 5 | export UKBPDIR=${PROJDATA}/ukb_pheno 6 | export UKBBPHENOFILE= 7 | export UDATADIR=${PROJDATA}/ukb 8 | export CDATADIR=${PROJDATA}/chembl 9 | export PYDIR=${UKBPROJROOT}/py 10 | export PYTHONPATH=${PYDIR}:${PYTHONPATH} 11 | export SHDIR=${UKBPROJROOT}/sh 12 | export LOCBINDIR=${HOME}/local/bin 13 | # Note that this is for sqlite3 access to ChEMBL data 14 | export CHFILE=${PROJDATA}/chembl_23/chembl_23.db 15 | # Fill in the following if using MySQL (other DBMS's may require different parameters) 16 | export CHHOST= 17 | export CHPORT= 18 | export CHDB=chembldb23 19 | export CHUSER= 20 | export CHPWD=
--------------------------------------------------------------------------------
/py/pheno/normalise_bnf_codes.py:
--------------------------------------------------------------------------------
1 | # Normalise dotted BNF codes (chapter.section.subsection) from participant rows into zero-padded concatenated codes 2 | # A fixed count of 10 is emitted for each row 3 | import time 4 | import datetime 5 | import re 6 | import os, sys 7 | 8 | start_time = time.time() 9 | 10 | def main(): 11 | 12 | print("id,icd9,count") 13 | bnf_col = 2 14 | 15 | for line in sys.stdin: 16 | data = line.strip().split(",") 17 | bnf_code = "" 18 | bnf_array = data[bnf_col].split(".") 19 | if bnf_array[0] == "DU": 20 | continue 21 | if bnf_array[0] == "NULL": 22 | continue 23 | if len(bnf_array) == 1: 24 | bnf_code = "{0:02d}".format(int(bnf_array[0])) 25 | elif len(bnf_array) == 2: 26 | bnf_code = "{0:02d}{1:02d}".format(int(bnf_array[0]), int(bnf_array[1])) 27 | elif len(bnf_array) == 3: 28 | bnf_code = "{0:02d}{1:02d}{2:02d}".format(int(bnf_array[0]), int(bnf_array[1]), int(bnf_array[2])) 29 | print("{},{},{}".format(data[0], bnf_code, 10)) 30 | return 31 | 32 | 33 | # execution flow starts here 34 | # 35 | main() 36 | #print "END:", time.time() - start_time, "seconds", rec_count 37 | 38 |
--------------------------------------------------------------------------------
/py/list_excl_words.py:
--------------------------------------------------------------------------------
1 | # List the Datahelper's excluded words 2 | import time 3 | import datetime 4 | import re 5 | import string 6 | import os, sys 7 | from optparse import OptionParser 8 | from datahelper import Datahelper 9 | 10 | def main(): 11 | 12 | try: 13 | dh = Datahelper() 14 | except IOError as e: 15 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 16 | print("I/O error:", sys.exc_info()) 17 | exit() 18 | except TypeError as e: 19 | print("Missing arguments ", e) 20 | exit() 21 | except: 22 | #print "Unexpected error:", sys.exc_info()[0] 23 | print("Unexpected error:", sys.exc_info()) 24 | exit() 25 | 26 | ewords = dh.get_excluded_words() 27 | 28 | count = 0 29 |
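# Print the excluded words four to a comma-separated line, flushing any remainder after the loop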
outarray = [] 30 | for wd in ewords: 31 | count += 1 32 | outarray.append(wd) 33 | if count % 4 == 0: 34 | print(",".join(outarray)) 35 | outarray = [] 36 | if len(outarray) > 0: 37 | print(",".join(outarray)) 38 | #print count 39 | 40 | return 41 | 42 | 43 | # execution flow starts here 44 | # 45 | main() 46 | #print "END:", time.time() - start_time, "seconds", count, match_count, miss_count 47 | 48 | -------------------------------------------------------------------------------- /sh/02_bnf_match.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # The Match steps: 4 | # BNF and UKBB data synonym merging, 5 | # 6 | # All BNF codes and descriptions: 7 | # UKBB side - has synonyms 8 | # BNF side - has synonyms 9 | # 10 | # 11 | echo "Step 4 - Merge CHEMBL molecule synonyms and BNF classification data" 12 | time ${SHDIR}/merge_chembl_synonyms_bnf.sh 13 | echo "Step 5 - Merge CHEMBL molecule synonyms and UKB self-reported medication data" 14 | time ${SHDIR}/merge_chembl_synonyms_ukbb.sh 15 | echo "Step 6 - The main matching step - attempt to assign BNF codes to UKBB data" 16 | time ${SHDIR}/bnf_words_synonyms.sh 17 | echo "Step 7 - Post process match / mismatch data to format for manual intervention and phenotype generation" 18 | time ${SHDIR}/bnf_post_process_match_data.sh 19 | 20 | # Main output files at the end of each step: 21 | 22 | # 4) ${BNFDATADIR}/bnf_combined_synonyms.csv 23 | 24 | # 5) ${UDATADIR}/medication_coding_synonyms.csv 25 | 26 | # 6) ${BNFDATADIR}/results/bnf_res.csv 27 | 28 | # 7) ${BNFDATADIR}/results/bnf_matched.csv and ${BNFDATADIR}/results/bnf_missing.csv (which is then used to assign manual matches) 29 | -------------------------------------------------------------------------------- /sh/02_atc_match.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # The Match steps: 4 | # ATC and UKBB data synonym merging, 5 | # 6 | # All ATC codes and descriptions: 7 | # UKBB side - has synonyms 8 | # ATC side - has synonyms 9 | # 10 | # 11 | echo "Step 4 - Merge CHEMBL molecule synonyms and ATC classification data" 12 | time ${SHDIR}/merge_chembl_synonyms_atc.sh 13 | echo "Step 5 - Merge CHEMBL molecule synonyms and UKB self-reported medication data" 14 | time ${SHDIR}/merge_chembl_synonyms_ukbb.sh 15 | echo "Step 6 - The main matching step - attempt to assign ATC level3 codes to UKBB data" 16 | time ${SHDIR}/atc_words_synonyms.sh 17 | echo "Step 7 - Post process match / mismatch data to format for manual intervention and phenotype generation" 18 | time ${SHDIR}/atc_post_process_match_data.sh 19 | 20 | # Main output files at the end of each step: 21 | 22 | # 4) ${ATCDATADIR}/atc_who_desc_synonyms.csv 23 | 24 | # 5) ${UDATADIR}/medication_coding_synonyms.csv 25 | 26 | # 6) ${ATCDATADIR}/results/atc_res.csv 27 | 28 | # 7) ${ATCDATADIR}/results/atc_matched.csv and ${ATCDATADIR}/results/atc_missing.csv (which is then used to assign manual matches) 29 | -------------------------------------------------------------------------------- /py/chembl/get_atc_data_with_molregno_sqlite.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sqlite3 3 | import os, sys 4 | from optparse import OptionParser 5 | # 6 | # Access the publicly available CHEMBL database (local copy) to 7 | # obtain ATC (WHO) classification data 8 | # 9 | # 10 | def main(): 11 | """ 12 | 
Get atc data for use in classifying UKBB coded medication data 13 | Requires a join of the CHEMBL atc_classification and molecule_atc_classification tables 14 | """ 15 | try: 16 | chembl = sqlite3.connect(os.environ["CHFILE"]) 17 | chembl.text_factory = str 18 | except: 19 | print("Unexpected error:", sys.exc_info()) 20 | exit() 21 | 22 | query = """select atc.who_name, atc.level5, matc.molregno, atc.level4_description 23 | from atc_classification atc 24 | LEFT JOIN molecule_atc_classification matc 25 | ON atc.level5 = matc.level5""" 26 | 27 | cursor = chembl.cursor() 28 | cursor.execute(query) 29 | for row in cursor: 30 | print('\t'.join([str(elem) for elem in row])) 31 | 32 | chembl.close() 33 | # main() ends 34 | 35 | 36 | # execution flow starts here 37 | # 38 | start_time = time.time() 39 | 40 | main()
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_bnf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | echo '.1 sort BNF coded data to' ${UKBPDIR} 4 | cat ${UKBPDIR}/ukb_20003_with_bnf_codes_matched.csv | \ 5 | sort -u > ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv 6 | 7 | # step 2 get the list of possible phenotypes 8 | echo '.2 extract list of possible phenotypes to' ${UKBPDIR} 9 | cut -f 3 -d ',' ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv | \ 10 | sort -u > ${UKBPDIR}/ukb_possible_med_phenotypes_bnf.csv 11 | 12 | # step 3 generate phenotype annotations, get data from the BNF description file 13 | echo '.3 Get data from bnf desc file, write to' ${UKBPDIR} 14 | cat ${UKBPDIR}/ukb_possible_med_phenotypes_bnf.csv | \ 15 | python ${PYDIR}/pheno/generate_bnf_medication_annotations.py \ 16 | --bnfcodes=${BNFCODEDIR}/bnf_listing.txt > \ 17 | ${UKBPDIR}/Anno_medications_BIN_bnf.csv 18 | 19 | # step 4 generate phenotypes 20 | echo '.4 write PheWAS phenos to' ${UKBPDIR} 21 | cat ${UKBPDIR}/ukb_20003_with_bnf_codes_matched_sorted.csv | \ 22 | python ${PYDIR}/pheno/generate_medication_phenotypes.py \ 23 | --pfile=${UKBPDIR}/Anno_medications_BIN_bnf.csv > \ 24 | ${UKBPDIR}/med_phenotypes_bnf.tsv 25 | #------------------------------------------------------------------------------------------------ 26 | echo 'END'
--------------------------------------------------------------------------------
/py/pheno/generate_bnf_medication_annotations.py:
--------------------------------------------------------------------------------
1 | import time 2 | import os, sys 3 | from datahelper import Datahelper 4 | from optparse import OptionParser 5 | 6 | def load_code_list(fh): 7 | lookup = {} 8 | for line in fh: 9 | codedata = line.strip().split("|") 10 | lookup[codedata[0]] = codedata[1] 11 | 12 | return lookup 13 | 14 | def main(options): 15 | """ 16 | Generate one PheWAS annotation record (pheno,PHENOTYPE,Category,type) per candidate BNF code read from stdin. 17 | """ 18 | dh = Datahelper() 19 | try: 20 | fh = open(options.bnfcodes, "r") 21 | code_lookup = load_code_list(fh) 22 | except: 23 | print("Unexpected error:", sys.exc_info()) 24 | exit() 25 | 26 | print("pheno,PHENOTYPE,Category,type") 27 | 28 | count = 0 29 | for line in sys.stdin: 30 | data = line.strip().split(',') 31 | 32 | 33 | if data[0] in code_lookup: 34 | count += 1 35 | pheno_string = dh.get_normalised_phrase(code_lookup[data[0]]) 36 | pheno_string = dh.make_pheno_string(pheno_string) 37 | #data.append(data[0] + "_" + pheno_string) 38 | data.append(data[0]) 39 | data.append(pheno_string) 40 | data.append("BINARY") 41 | print(','.join(data)) 42 | 43 | return count 44 | 45 | # execution flow starts here 46 | # 47 | start_time = time.time() 48 | parser = OptionParser() 49 | parser.add_option("-b", "--bnfcodes", dest="bnfcodes", 50 | help="bnfcodes", metavar="FILE") 51 | 52 | (options, args) = parser.parse_args() 53 | 54 | rcount = main(options) 55 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_bnf_prep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Just get the matches from the manually edited file 5 | #egrep -vw "UKBB_code|NA" ${BNFDATADIR}/results/bnf_all_manual_matches.csv > \ 6 | # ${BNFDATADIR}/results/bnf_manual_matches.csv 7 | # Cut relevant columns 8 | #cut -f 1,2,5 -d ',' ${BNFDATADIR}/results/bnf_manual_matches.csv > \ 9 | # ${BNFDATADIR}/results/bnf_manual_matches_cut.csv 10 | #cut -f 1,2,6 -d ',' ${BNFDATADIR}/results/bnf_matched.csv > \ 11 | # ${BNFDATADIR}/results/bnf_all_matches.csv 12 | 13 | # Combine auto and manually matched codes 14 | #cat ${BNFDATADIR}/results/bnf_auto_matches_cut.csv \ 15 | # ${BNFDATADIR}/results/bnf_manual_matches_cut.csv > \ 16 | # ${BNFDATADIR}/results/bnf_all_matches.csv 17 | 18 | # Assign codes where possible to all items in the reported medication list 19 | cat ${UKBPDIR}/ukb_20003_reported_medication_n.csv | \ 20 | python ${PYDIR}/pheno/assign_codes_to_participant_data.py \ 21 | --codefile=${BNFDATADIR}/results/bnf_all_matches.csv > \ 22 | ${UKBPDIR}/ukb_20003_with_bnf_codes.csv 23 | 24 | # Get list of unmatched participant medication data 25 | grep -w NA ${UKBPDIR}/ukb_20003_with_bnf_codes.csv > \ 26 | ${UKBPDIR}/ukb_20003_with_bnf_codes_unmatched.csv 27 | 28 | # Get list of matched participant medication data (without a header); this 29 | # will feed into phenotype generation 30 | grep -wv NA ${UKBPDIR}/ukb_20003_with_bnf_codes.csv | sed '1,1d' > \ 31 | ${UKBPDIR}/ukb_20003_with_bnf_codes_matched.csv 32 |
--------------------------------------------------------------------------------
/py/generate_syn_dictionary.py:
--------------------------------------------------------------------------------
1 | # 2 | # Third step in the chembl synonym generation pipeline: 3 | # python ${PYDIR}/chembl/dump_sqlite_table_data.py --tablename=molecule_synonyms | \ 4 | # sort -k1,1 -n | \ 5 | # python ${PYDIR}/parse_chembl_synonyms.py | \ 6 | # python ${PYDIR}/generate_syn_dictionary.py > ${CDATADIR}/syn_dict_all.txt 7 | # 8 | import time 9 | import datetime 10 | import re 11 | import os, sys 12 | import random 13 | import json 14 | 15 | def main(): 16 | """ 17 | For each synonym set of size n (record) in the input: generate n records in 18 | which each synonym is the key.
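For example (hypothetical record): an input line TRADE_NAME:somedrug|INN:some chemical|MOLREGNO:99999 yields one output record per synonym, each keyed on that synonym and carrying the full '|'-separated synonym set.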
19 | 20 | But special cases need handling: 21 | Achieved by building an internal dictionary which allows for synonyms appearing in more 22 | than 1 input record (In CHEMBL terms allows for > 1 molregno having the same synonym in its 23 | synonym set) 24 | """ 25 | count = 0 26 | synonyms = {} 27 | 28 | for line in sys.stdin: 29 | data = line.strip().split(',') 30 | type_syns = data[0].split('|') 31 | syns = [x.split(':')[1].strip() for x in type_syns] 32 | for syn in syns: 33 | if syn not in synonyms: 34 | synonyms[syn] = [] 35 | for other_syn in syns: 36 | if other_syn not in synonyms[syn]: 37 | synonyms[syn].append(other_syn) 38 | 39 | for syn in sorted(synonyms): 40 | print("{0}\t{1}".format(syn, '|'.join(set(synonyms[syn])))) 41 | count += 1 42 | return count 43 | 44 | # execution flow starts here 45 | # 46 | start_time = time.time() 47 | count = main() 48 | 49 |
--------------------------------------------------------------------------------
/py/parse_chembl_synonyms.py:
--------------------------------------------------------------------------------
1 | # 2 | # Second step in the chembl synonym generation pipeline: 3 | # python ${PYDIR}/chembl/dump_sqlite_table_data.py --tablename=molecule_synonyms | \ 4 | # sort -k1,1 -n | \ 5 | # python ${PYDIR}/parse_chembl_synonyms.py | \ 6 | # python ${PYDIR}/generate_syn_dictionary.py > ${CDATADIR}/syn_dict_all.txt 7 | # 8 | # 9 | import time 10 | import datetime 11 | import re 12 | import os, sys 13 | import random 14 | import json 15 | from datahelper import Datahelper 16 | 17 | def main(): 18 | """ 19 | Requires the input to be sorted on the first field (the numeric molregno). 20 | One record per molregno is output. 21 | 22 | Calls a datahelper function to normalise each word or phrase (convert to 23 | lower case and remove special characters) 24 | """ 25 | count = 0 26 | last_molno = "" 27 | related_synonyms = [] 28 | dh = Datahelper() 29 | 30 | for line in sys.stdin: 31 | data = line.strip().split('\t') 32 | if data[0] != last_molno and last_molno != "": 33 | print('|'.join(related_synonyms) + "|MOLREGNO:" + last_molno) 34 | related_synonyms = [] 35 | last_molno = data[0] 36 | text = dh.get_normalised_phrase(data[1]) 37 | stype = data[2] 38 | syn = stype + ":" + text 39 | if syn not in related_synonyms: 40 | related_synonyms.append(syn) 41 | 42 | # output the last synonym group 43 | print('|'.join(related_synonyms) + "|MOLREGNO:" + last_molno) 44 | 45 | # execution flow starts here 46 | # 47 | start_time = time.time() 48 | 49 | count = main() 50 | 51 |
--------------------------------------------------------------------------------
/py/append_ukb_counts.py:
--------------------------------------------------------------------------------
1 | # 2 | # Append UKBB self-report counts to a file 3 | # with bnf description as col 2 4 | # 5 | import time 6 | import datetime 7 | import re 8 | import os, sys 9 | import random 10 | import json 11 | from optparse import OptionParser 12 | 13 | def load_count_data(fh): 14 | ukbb_counts = {} 15 | 16 | for line in fh: 17 | data = line.strip().split(',') 18 | ukbb_counts[data[0]] = data[1] 19 | 20 | return ukbb_counts 21 | 22 | def main(options): 23 | count = 0 24 | last_molno = "" 25 | related_synonyms = [] 26 | 27 | # try to load the UK counts file 28 | try: 29 | fh = open(options.ukbcfile, "r") 30 | ukbb_counts = load_count_data(fh) 31 | except IOError as e: 32 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 33 | print("I/O error:", sys.exc_info()) 34 | exit() 35 | except TypeError as e: 36 | print("Missing arguments ", e) 37 | exit() 38 | except: 39 | #print "Unexpected error:", sys.exc_info()[0] 40 | print("Unexpected error:", sys.exc_info()) 41 | exit() 42 | 43 | for line in sys.stdin: 44 | data = line.strip().split(',') 45 | count = '0' 46 | if data[1] in ukbb_counts: 47 | count = ukbb_counts[data[1]] 48 | data.append(count) 49 | print(','.join(data)) 50 | 51 | # execution flow starts here 52 | # 53 | start_time = time.time() 54 | parser = OptionParser() 55 | # 56 | parser.add_option("-u", "--ukbcfile", dest="ukbcfile", 57 | help="UKB count file", metavar="FILE") 58 | 59 | (options, args) = parser.parse_args() 60 | 61 | count = main(options) 62 | #print "END:", time.time() - start_time, "seconds", count 63 | 64 |
--------------------------------------------------------------------------------
/py/chembl/dump_sqlite_table_data.py:
--------------------------------------------------------------------------------
1 | import time 2 | import sqlite3 3 | import os, sys 4 | from optparse import OptionParser 5 | 6 | def main(options): 7 | """ 8 | Dump out the data for any sqlite table in the chembl db as a tab-separated values file 9 | Accepts an optional (simple) where clause and an optional row limit. 10 | """ 11 | try: 12 | chembl = sqlite3.connect(os.environ["CHFILE"]) 13 | chembl.text_factory = str 14 | except: 15 | #print "Unexpected error:", sys.exc_info()[0] 16 | print("Unexpected error:", sys.exc_info()) 17 | exit() 18 | 19 | query = "select * from {0}".format(options.tablename) 20 | 21 | if options.where_clause != None: 22 | query += " {0}".format(options.where_clause) 23 | 24 | if options.limit != None: 25 | query += " limit {0}".format(options.limit) 26 | 27 | count = 0 28 | 29 | cursor = chembl.cursor() 30 | cursor.execute(query) 31 | for row in cursor: 32 | count += 1 33 | print('\t'.join([str(elem) for elem in row])) 34 | 35 | chembl.close() 36 | 37 | return count 38 | 39 | # execution flow starts here 40 | # 41 | parser = OptionParser() 42 | 43 | parser.add_option("-t", "--tablename", dest="tablename", 44 | help="Table name", metavar="STR") 45 | 46 | parser.add_option("-w", "--where_clause", dest="where_clause", 47 | help="Optional where clause", metavar="STR") 48 | 49 | parser.add_option("-l", "--limit", dest="limit", 50 | help="Optional row limit (suggest 1 at test-time)", metavar="STR") 51 | 52 | start_time = time.time() 53 | (options, args) = parser.parse_args() 54 | 55 | rcount = main(options) 56 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/bnf_post_process_match_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Post process the BNF results files 4 | # 5 | egrep -w "NA" ${BNFDATADIR}/results/bnf_res.csv > ${BNFDATADIR}/results/bnf_res_NA.csv 6 | egrep -vw "AMB|NA" ${BNFDATADIR}/results/bnf_res.csv > ${BNFDATADIR}/results/bnf_res_NONA.csv 7 | # add UKBB counts to matches and unmatches 8 | cat ${BNFDATADIR}/results/bnf_res_NA.csv | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > ${BNFDATADIR}/results/bnf_missing.csv 9 | cat ${BNFDATADIR}/results/bnf_res_NONA.csv | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > ${BNFDATADIR}/results/bnf_matched.csv 10 | # one_word matches are the most risky 11 | grep ":1," ${BNFDATADIR}/results/bnf_matched.csv > ${BNFDATADIR}/results/bnf_one_word_match_full.csv 12 | cut -f 1,2,3,5,6 -d ',' ${BNFDATADIR}/results/bnf_one_word_match_full.csv > ${BNFDATADIR}/results/one_word_match_list.csv 13 | 14 | # all matched data 15 | cut -f 1,2,6 -d ',' ${BNFDATADIR}/results/bnf_matched.csv > ${BNFDATADIR}/results/bnf_matched_list.csv 16 | sort ${BNFDATADIR}/results/bnf_matched_list.csv > ${BNFDATADIR}/results/bnf_matched_list_sorted.csv 17 | # looking at missing data 18 | cut -f 1,2,8 -d ',' ${BNFDATADIR}/results/bnf_missing.csv > ${BNFDATADIR}/results/bnf_unmatched_list.csv 19 | sort ${BNFDATADIR}/results/bnf_unmatched_list.csv > ${BNFDATADIR}/results/bnf_unmatched_list_sorted.csv 20 | # unique matched UKBB ids 21 | cut -f 1,2 -d ',' ${BNFDATADIR}/results/bnf_matched.csv | sort -u > ${BNFDATADIR}/results/bnf_matched_unique.csv 22 | cut -f 1,2 -d ',' ${BNFDATADIR}/results/bnf_matched.csv | sort | uniq -c | sort -nr > ${BNFDATADIR}/results/bnf_match_counts.csv 23 | 24 |
--------------------------------------------------------------------------------
/py/chembl/generate_atc_medication_annotations_sqlite.py:
--------------------------------------------------------------------------------
1 | import time 2 | import sqlite3 3 | import os, sys 4 | from datahelper import Datahelper 5 | from optparse import OptionParser 6 | 7 | def main(options): 8 | """ 9 | Access the CHEMBL db for each input line and use the description 10 | from the appropriate level 11 | """ 12 | 13 | level = int(options.level) 14 | 15 | dh = Datahelper() 16 | try: 17 | chembl = sqlite3.connect(os.environ["CHFILE"]) 18 | chembl.text_factory = str 19 | except: 20 | #print "Unexpected error:", sys.exc_info()[0] 21 | print("Unexpected error:", sys.exc_info()) 22 | exit() 23 | 24 | print("pheno,PHENOTYPE,Category,type") 25 | 26 | count = 0 27 | for line in sys.stdin: 28 | data = line.strip().split(',') 29 | atc_code = data[0] 30 | query = "select level{0}_description from atc_classification where level{1} = '{2}' limit 1".format(level, level, atc_code) 31 | 32 | 33 | cursor = chembl.cursor() 34 | cursor.execute(query) 35 | for row in cursor: 36 | count += 1 37 | pheno_string = dh.get_normalised_phrase(row[0]) 38 | pheno_string = dh.make_pheno_string(pheno_string) 39 | data.append(data[0] + "_" + pheno_string) 40 | data.append(pheno_string) 41 | data.append("BINARY") 42 | print(','.join(data)) 43 | 44 | chembl.close() 45 | return count 46 | 47 | # execution flow starts here 48 | # 49 | start_time = time.time() 50 | parser = OptionParser() 51 | parser.add_option("-l", "--level", dest="level", 52 | help="ATC level", metavar="INT") 53 | (options, args) = parser.parse_args() 54 | 55 | rcount = main(options) 56 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_atc_level3.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # For level 3 4 | # step 1 sort to eliminate duplicates - at this stage we have level3 codes, one row per 5 | # participant per assigned ATC code 6 | echo '.1 Write level3 codes to' ${UKBPDIR} 7 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched.csv | \ 8 | python ${PYDIR}/pheno/trim_atc_code.py --codelen=4 | \ 9 | sort -u > ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level3.csv 10 | 11 | # step 2 get the list of possible phenotypes 12 | echo '.2 extract list of possible phenotypes to' ${UKBPDIR} 13 | cut -f 3 -d ',' ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level3.csv | \ 14 | sort -u > ${UKBPDIR}/ukb_possible_med_phenotypes_level3.csv 15 | 16 | # step 3 generate phenotype annotations 17 | echo '.3 Get data from sqlite db, write to' ${UKBPDIR} 18 | cat ${UKBPDIR}/ukb_possible_med_phenotypes_level3.csv | \ 19 | python ${PYDIR}/chembl/generate_atc_medication_annotations_sqlite.py --level=3 > \ 20 | ${UKBPDIR}/Anno_medications_BIN_atc_level3.csv 21 | 22 | #echo '.3 Get data from mysql db, write to ${UKBPDIR}' 23 | #cat ${UKBPDIR}/ukb_possible_med_phenotypes_level3.csv | \ 24 | # python ${PYDIR}/chembl/generate_atc_medication_annotations.py --level=3 > \ 25 | # ${UKBPDIR}/Anno_medications_BIN_atc_level3.csv 26 | 27 | # step 4 generate phenotypes 28 | echo '.4 write PheWAS phenos to' ${UKBPDIR} 29 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level3.csv | \ 30 | python ${PYDIR}/pheno/generate_medication_phenotypes.py \ 31 | --pfile=${UKBPDIR}/Anno_medications_BIN_atc_level3.csv > \ 32 | ${UKBPDIR}/med_phenotypes_level3.tsv 33 | #------------------------------------------------------------------------------------------------ 34 | echo 'END' 35 |
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_atc_level2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # For level 2 4 | # step 1 sort to eliminate duplicates - at this stage we have level2 codes, one row per 5 | # participant per code 6 | echo '.1 Write level2 codes to' ${UKBPDIR} 7 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched.csv | \ 8 | python ${PYDIR}/pheno/trim_atc_code.py --codelen=3 | \ 9 | sort -u > ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level2.csv 10 | 11 | # step 2 get the list of possible phenotypes 12 | echo '.2 extract list of possible phenotypes to' ${UKBPDIR} 13 | cut -f 3 -d ',' ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level2.csv | \ 14 | sort -u > ${UKBPDIR}/ukb_possible_med_phenotypes_level2.csv 15 | 16 | # step 3 generate phenotype annotations, get data from the CHEMBL db 17 | echo '.3 Get data from sqlite db, write to' ${UKBPDIR} 18 | cat ${UKBPDIR}/ukb_possible_med_phenotypes_level2.csv | \ 19 | python ${PYDIR}/chembl/generate_atc_medication_annotations_sqlite.py --level=2 > \ 20 | ${UKBPDIR}/Anno_medications_BIN_atc_level2.csv 21 | 22 | #echo '.3 Get data from mysql db, write to' ${UKBPDIR} 23 | #cat ${UKBPDIR}/ukb_possible_med_phenotypes_level2.csv | \ 24 | # python ${PYDIR}/chembl/generate_atc_medication_annotations.py --level=2 > \ 25 | # ${UKBPDIR}/Anno_medications_BIN_atc_level2.csv 26 | 27 | # step 4 generate phenotypes 28 | echo '.4 write PheWAS phenos to' ${UKBPDIR} 29 | cat ${UKBPDIR}/ukb_20003_with_atc_codes_matched_sorted_level2.csv | \ 30 | python ${PYDIR}/pheno/generate_medication_phenotypes.py \ 31 | --pfile=${UKBPDIR}/Anno_medications_BIN_atc_level2.csv > \ 32 | ${UKBPDIR}/med_phenotypes_level2.tsv 33 | #------------------------------------------------------------------------------------------------ 34 | echo 'END' 35 |
--------------------------------------------------------------------------------
/py/chembl/generate_atc_medication_annotations.py:
--------------------------------------------------------------------------------
1 | import time 2 | import pymysql 3 | import os, sys 4 | from datahelper import Datahelper 5 | from optparse import OptionParser 6 | 7 | def main(options): 8 | """ 9 | Access the CHEMBL db for each input line and use the description 10 | from the appropriate level 11 | """ 12 | 13 | level = int(options.level) 14 | 15 | dh = Datahelper() 16 | try: 17 | chembl = pymysql.connect(host=os.environ["CHHOST"], user=os.environ["CHUSER"], passwd=os.environ["CHPWD"], 18 | port=int(os.environ["CHPORT"]), db=os.environ["CHDB"]) 19 | except: 20 | #print "Unexpected error:", sys.exc_info()[0] 21 | print("Unexpected error:", sys.exc_info()) 22 | exit() 23 | 24 | print("pheno,PHENOTYPE,Category,type") 25 | 26 | count = 0 27 | for line in sys.stdin: 28 | data = line.strip().split(',') 29 | atc_code = data[0] 30 | query = "select level{0}_description from atc_classification where level{1} = '{2}' limit 1".format(level, level, atc_code) 31 | 32 | 33 | cursor = chembl.cursor() 34 | cursor.execute(query) 35 | for row in cursor: 36 | count += 1 37 | pheno_string = dh.get_normalised_phrase(row[0]) 38 | pheno_string = dh.make_pheno_string(pheno_string) 39 | data.append(data[0] + "_" + pheno_string) 40 | data.append(pheno_string) 41 | data.append("BINARY") 42 | print(','.join(data)) 43 | 44 | chembl.close() 45 | return count 46 | 47 | # execution flow starts here 48 | # 49 | start_time = time.time() 50 | parser = OptionParser() 51 | parser.add_option("-l", "--level", dest="level", 52 | help="ATC level", metavar="INT") 53 | (options, args) = parser.parse_args() 54 | 55 | rcount = main(options) 56 | #print "END:", time.time() - start_time, "seconds", rcount
--------------------------------------------------------------------------------
/sh/pheno/make_ukb_phenotypes_atc_prep.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # 4 | # Assign ATC level3 codes to participant,medication data 5 | # Includes steps to cut only the relevant columns from matched data 6 | # and to combine automatically matched data with manually 7 | # assigned codes 8 | # 9 | # Just get the matches from the manually edited file 10 | cp -f ${ATCDATADIR}/results/atc_manual_matches.csv ${ATCDATADIR}/results/atc_manual_matches.csv.BAK 11 | egrep -vw "UKBB_code|NA" ${ATCDATADIR}/results/atc_manual_matches.csv > \ 12 | ${ATCDATADIR}/results/atc_manual_matches_detail.csv 13 | # Cut relevant columns 14 | cut -f 1,2,5 -d ',' ${ATCDATADIR}/results/atc_manual_matches_detail.csv > \ 15 | ${ATCDATADIR}/results/atc_manual_matches_cut.csv 16 | cut -f 1,2,6 -d ',' ${ATCDATADIR}/results/atc_matched.csv > \ 17 | ${ATCDATADIR}/results/atc_auto_matches_cut.csv 18 | 19 | # Combine auto and manually matched codes 20 | cat ${ATCDATADIR}/results/atc_auto_matches_cut.csv \ 21 | ${ATCDATADIR}/results/atc_manual_matches_cut.csv > \ 22 | ${ATCDATADIR}/results/atc_all_matches.csv 23 | 24 | # Assign codes where possible to all items in the reported medication list 25 | cat ${UKBPDIR}/ukb_20003_reported_medication_n.csv | \ 26 | python ${PYDIR}/pheno/assign_codes_to_participant_data.py \ 27 | --codefile=${ATCDATADIR}/results/atc_all_matches.csv > \ 28 | ${UKBPDIR}/ukb_20003_with_atc_codes.csv 29 | 30 | # Get list of unmatched participant medication data 31 | grep -w NA ${UKBPDIR}/ukb_20003_with_atc_codes.csv > \ 32 | ${UKBPDIR}/ukb_20003_with_atc_codes_unmatched.csv 33 | 34 | # Get list of matched participant medication data (without a header); this 35 | # will feed into phenotype generation 36 | grep -wv NA ${UKBPDIR}/ukb_20003_with_atc_codes.csv | sed '1,1d' > \ 37 | ${UKBPDIR}/ukb_20003_with_atc_codes_matched.csv 38 | 39 |
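# The matched file now holds one row per participant per assigned code, of the form
# eid,ukbb_code,atc_code (hypothetical example row: 1000001,123456,A10B), which feeds
# the level2/level3 phenotype generation scripts.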
-------------------------------------------------------------------------------- /sh/atc_post_process_match_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | source ${UKBPROJROOT}/env/common 3 | # Post process the ATC results files 4 | # 5 | # differentiate between unmatched and matched data 6 | egrep -w "NA" ${ATCDATADIR}/results/atc_res.csv > \ 7 | ${ATCDATADIR}/results/atc_res_NA.csv 8 | egrep -vw "NA" ${ATCDATADIR}/results/atc_res.csv > \ 9 | ${ATCDATADIR}/results/atc_res_NONA.csv 10 | # add UKBB counts to matches and unmatches 11 | cat ${ATCDATADIR}/results/atc_res_NA.csv | \ 12 | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > \ 13 | ${ATCDATADIR}/results/atc_missing.csv 14 | cat ${ATCDATADIR}/results/atc_res_NONA.csv | \ 15 | python ${PYDIR}/append_ukb_counts.py --ukbcfile=${UDATADIR}/UKB_counts.csv > \ 16 | ${ATCDATADIR}/results/atc_matched.csv 17 | # one_word matches are the most risky 18 | grep ":1," ${ATCDATADIR}/results/atc_matched.csv > \ 19 | ${ATCDATADIR}/results/atc_one_word_match_full.csv 20 | cut -f 1,2,3,5,6 -d ',' ${ATCDATADIR}/results/atc_one_word_match_full.csv > \ 21 | ${ATCDATADIR}/results/one_word_match_list.csv 22 | 23 | # Following steps extract data for manual examination / intervention 24 | # all matches 25 | cut -f 1,2,6 -d ',' ${ATCDATADIR}/results/atc_matched.csv > \ 26 | ${ATCDATADIR}/results/atc_matched_list.csv 27 | sort ${ATCDATADIR}/results/atc_matched_list.csv > \ 28 | ${ATCDATADIR}/results/atc_matched_list_sorted.csv 29 | # looking at missing data 30 | cut -f 1,2,8 -d ',' ${ATCDATADIR}/results/atc_missing.csv > \ 31 | ${ATCDATADIR}/results/atc_unmatched_list.csv 32 | sort ${ATCDATADIR}/results/atc_unmatched_list.csv > \ 33 | ${ATCDATADIR}/results/atc_unmatched_list_sorted.csv 34 | # unique matched UKBB ids 35 | cut -f 1,2 -d ',' ${ATCDATADIR}/results/atc_matched.csv | \ 36 | sort -u > ${ATCDATADIR}/results/atc_matched_unique.csv 37 | cut -f 1,2 -d ',' ${ATCDATADIR}/results/atc_matched.csv | \ 38 | sort | \ 39 | uniq -c | \ 40 | sort -nr > ${ATCDATADIR}/results/atc_match_counts.csv 41 | 42 | -------------------------------------------------------------------------------- /py/pheno/assign_codes_to_participant_data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Assign Classification System codes to participant data 3 | # Outputs "NA" where no code is present 4 | # 5 | import time 6 | import datetime 7 | import re 8 | import os, sys 9 | import random 10 | import json 11 | from optparse import OptionParser 12 | from datahelper import Datahelper 13 | 14 | def load_cs_codes(fh): 15 | code_lookup = {} 16 | 17 | for line in fh: 18 | data = line.strip().split(',') 19 | if data[0] not in code_lookup: 20 | code_lookup[data[0]] = [] 21 | code_lookup[data[0]].append(data[2]) 22 | 23 | return code_lookup 24 | 25 | def main(options): 26 | count = 0 27 | match_count = 0 28 | miss_count = 0 29 | dh = Datahelper() 30 | 31 | try: 32 | fh = open(options.codefile, "r") 33 | code_lookup = load_cs_codes(fh) 34 | #print len(synonyms) 35 | except IOError as e: 36 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 37 | exit() 38 | except TypeError as e: 39 | print("Missing arguments ", e) 40 | exit() 41 | except: 42 | #print "Unexpected error:", sys.exc_info()[0] 43 | print("Unexpected error:", sys.exc_info()) 44 | exit() 45 | 46 | hdr = sys.stdin.readline().strip() 47 | print("{0},{1}".format(hdr, "cs_code")) 48 | 49 | for line in 
sys.stdin: 50 | count += 1 51 | data = line.strip().split(',') 52 | data.append("NA") 53 | if data[1] in code_lookup: 54 | for code in code_lookup[data[1]]: 55 | data[-1] = code 56 | match_count += 1 57 | print(",".join(data)) 58 | else: 59 | miss_count += 1 60 | print(",".join(data)) 61 | 62 | return count, match_count, miss_count 63 | 64 | # execution flow starts here 65 | # 66 | start_time = time.time() 67 | parser = OptionParser() 68 | # 69 | parser.add_option("-c", "--codefile", dest="codefile", 70 | help="UKBB vs CS code file", metavar="FILE") 71 | 72 | (options, args) = parser.parse_args() 73 | 74 | count, ycount, ncount = main(options) 75 | #print "END:", time.time() - start_time, "seconds", count, ycount, ncount 76 | 77 | -------------------------------------------------------------------------------- /py/merge_chembl_synonyms.py: -------------------------------------------------------------------------------- 1 | # 2 | # Attempt to match CHEMBL synonyms with coding data 3 | # Produce output with attached synonyms, where possible 4 | # 5 | import time 6 | import datetime 7 | import re 8 | import os, sys 9 | import random 10 | import json 11 | from optparse import OptionParser 12 | from datahelper import Datahelper 13 | 14 | def load_synonyms(fh): 15 | """ 16 | Load WHOLE synonyms only into a python dictionary 17 | which will then be used as a look-up for input data 18 | Output coding data with synonyms attached, 19 | where possible 20 | """ 21 | synonyms = {} 22 | 23 | for line in fh: 24 | data = line.strip().split('\t') 25 | syns1 = data[1].split('|') 26 | syns = [x.strip() for x in syns1] 27 | for syn in syns: 28 | if syn not in synonyms: 29 | synonyms[syn] = [] 30 | for asyn in syns: 31 | if asyn not in synonyms[syn]: 32 | synonyms[syn].append(asyn) 33 | 34 | return synonyms 35 | 36 | def main(options): 37 | count = 0 38 | mcount = 0 39 | umcount = 0 40 | dh = Datahelper() 41 | 42 | try: 43 | fh = open(options.synfile, "r") 44 | synonyms = load_synonyms(fh) 45 | except IOError as e: 46 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 47 | exit() 48 | except TypeError as e: 49 | print("Missing arguments ", e) 50 | exit() 51 | except: 52 | print("Unexpected error:", sys.exc_info()) 53 | exit() 54 | 55 | for line in sys.stdin: 56 | count += 1 57 | data = line.strip().split(',') 58 | phrase = data[1].lower() 59 | matched = False 60 | for key in dh.get_merge_key_list(phrase): 61 | if key in synonyms: 62 | print("{0},{1},{2}".format(data[0], phrase, '|'.join(synonyms[key]))) 63 | matched = True 64 | mcount += 1 65 | break 66 | if matched == False: 67 | print("{0},{1}".format(data[0], phrase)) 68 | 69 | return count, mcount 70 | 71 | # execution flow starts here 72 | # 73 | start_time = time.time() 74 | parser = OptionParser() 75 | # 76 | parser.add_option("-s", "--synfile", dest="synfile", 77 | help="molecule synonyms", metavar="FILE") 78 | 79 | (options, args) = parser.parse_args() 80 | 81 | count, mcount = main(options) 82 | 83 | -------------------------------------------------------------------------------- /py/pheno/generate_medication_phenotypes.py: -------------------------------------------------------------------------------- 1 | # 2 | # Generate binary phenotypes from the set of UKB 3 | # participants reporting taking medications 4 | # Load the column list as a code vs num array 5 | # 6 | # For each person, output a line which has 0's 7 | # where the person has taken no medication in a category 8 | # and 1's where they have 9 | # HOW TO PROPERLY QC THIS? 
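# Example (hypothetical codes): with annotation columns [A02B, A10B, C07A], a participant
# whose reported medications map to A02B and C07A yields the tab-separated row: eid eid 1 0 1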
10 | import time 11 | import datetime 12 | import re 13 | import os, sys 14 | import random 15 | import json 16 | from optparse import OptionParser 17 | 18 | def load_pheno_names(pfile): 19 | """ 20 | Here the column order for the output file is set 21 | to the order of reading the file records 22 | """ 23 | plookup = {} 24 | pcolnames = [] 25 | col = 0 26 | 27 | hdr = pfile.readline() 28 | 29 | for line in pfile: 30 | data = line.strip().split(",") 31 | plookup[data[0]] = col 32 | pcolnames.append(data[1]) 33 | col += 1 34 | 35 | return plookup, pcolnames 36 | 37 | def main(options): 38 | count = 0 39 | synonyms = {} 40 | ccol = int(options.codecol) 41 | #codes = {} 42 | 43 | try: 44 | fh = open(options.pfile, "r") 45 | plookup, pcolnames = load_pheno_names(fh) 46 | except: 47 | print("Failed to open phenotype code file {0}".format(options.pfile)) 48 | sys.exit() 49 | 50 | #print "FID,IID,%s" % (",".join(pcolnames)) 51 | print("FID\tIID\t{0}".format("\t".join(pcolnames))) 52 | #print len(pcolnames) + 2 53 | 54 | last_eid = "" 55 | phen_array = ['0'] * len(pcolnames) 56 | 57 | for line in sys.stdin: 58 | data = line.strip().split(",") 59 | if data[0] != last_eid: 60 | if last_eid != "": 61 | #print "%s,%s,%s" % (last_eid, last_eid, ",".join(phen_array)) 62 | print("{0}\t{1}\t{2}".format(last_eid, last_eid, "\t".join(phen_array))) 63 | phen_array = ['0'] * len(pcolnames) 64 | last_eid = data[0] 65 | if data[ccol] in plookup: 66 | phen_array[plookup[data[ccol]]] = '1' 67 | 68 | #print "%s,%s,%s" % (last_eid, last_eid, ",".join(phen_array)) 69 | print("{0}\t{1}\t{2}".format(last_eid, last_eid, "\t".join(phen_array))) 70 | return count 71 | 72 | # execution flow starts here 73 | # 74 | start_time = time.time() 75 | parser = OptionParser() 76 | # 77 | parser.add_option("-p", "--pfile", dest="pfile", 78 | help="phenotype code file", metavar="FILE") 79 | 80 | parser.add_option("-c", "--codecol", dest="codecol", 81 | help="column containing med code", metavar="STR") 82 | 83 | (options, args) = parser.parse_args() 84 | 85 | if options.codecol == None: 86 | options.codecol = "2" 87 | 88 | count = main(options) 89 | #print "END:", time.time() - start_time, "seconds", count 90 | 91 |
--------------------------------------------------------------------------------
/py/pheno/cut_main_csv_file.py:
--------------------------------------------------------------------------------
1 | # 2 | # Cuts columns of interest from the main UKB phenotype csv file 3 | # 4 | # Takes account of multiple column versions contained within the file by matching column prefixes 5 | # in column headers 6 | # 7 | # Always outputs the eid field as the first one, followed by any column with the required prefix 8 | # 9 | import time 10 | import datetime 11 | import os, sys 12 | import csv 13 | from optparse import OptionParser 14 | 15 | start_time = time.time() 16 | sys.stdout.flush() 17 | 18 | def main(options): 19 | csvreader = None 20 | count=0 21 | idcol = int(options.idcol) 22 | 23 | try: 24 | csvfile = open(options.csvfile, "r") 25 | csvreader = csv.reader(csvfile) 26 | except IOError as e: 27 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 28 | exit() 29 | except TypeError as e: 30 | print("Missing arguments ", e) 31 | exit() 32 | except: 33 | print("Unexpected error:", sys.exc_info()) 34 | sys.exit() 35 | 36 | colprefs = options.colprefs.split(',') 37 | #print colprefs 38 | cols = [] 39 | outhdr = [] 40 | 41 | try: 42 | hdr = next(csvreader) 43 | #print len(hdr) 44 | outhdr.append(hdr[idcol]) 45 | cols.append(idcol) 46 | # Process the header record to capture column indices and build the output 47 | # header record 48 | for colpref in colprefs: 49 | for i, col in enumerate(hdr): 50 | coldata=col.split('-') 51 | if coldata[0] == colpref: 52 | outhdr.append(col) 53 | #print i, coldata 54 | cols.append(i) 55 | print(",".join(outhdr)) 56 | #print cols 57 | #print len(outhdr) 58 | 59 | for row in csvreader: 60 | outrec=[] 61 | count += 1 62 | # iterate over each row element 63 | # for i,elem in enumerate(row): 64 | # if i in cols: 65 | # outrec.append(elem) 66 | for idx in cols: 67 | outrec.append(row[idx]) 68 | print(",".join(outrec)) 69 | except: 70 | print("Unexpected error (2):", sys.exc_info()[0]) 71 | print(sys.exc_info()) 72 | sys.exit() 73 | 74 | return count 75 | # 76 | # execution flow starts here 77 | # 78 | parser = OptionParser() 79 | parser.add_option("-c", "--csvfile", dest="csvfile", 80 | help="csv file containing main UKB data", metavar="FILE") 81 | # col prefixes are comma separated - no complaint is made if a prefix doesn't exist in the data 82 | parser.add_option("-p", "--colprefs", dest="colprefs", 83 | help="UKB column prefixes", metavar="FILE") 84 | 85 | parser.add_option("-i", "--idcol", dest="idcol", 86 | help="Column number of id, default 0", metavar="INT") 87 | 88 | (options, args) = parser.parse_args() 89 | if options.idcol == None: 90 | options.idcol = "0" 91 | # 92 | rec_count = main(options) 93 | #print "END:", time.time() - start_time, "seconds", rec_count 94 | 95 | 96 |
--------------------------------------------------------------------------------
/py/code_data_match.py:
--------------------------------------------------------------------------------
1 | # The main text match process 2 | # 3 | # Dictionaries: 4 | # 5 | # Classification system code file data fields: 6 | # 1 the code 7 | # 2 the description 8 | # 3 synonyms added from CHEMBL (separated by '|') 9 | # 10 | import time 11 | import datetime 12 | import re 13 | import string 14 | import os, sys 15 | from optparse import OptionParser 16 | from datahelper import Datahelper 17 | 18 | def main(options): 19 | """ 20 | The main match process - look up descriptions and 21 | synonyms in the coding data dictionary (loaded 22 | on initialisation) 23 | SEE ALSO: datahelper.py 24 | """ 25 | dcount = 0 26 | count = 0 27 | match_count = 0 28 | miss_count = 0 29 | 30 | # try to load the classification system codes file 31 | try: 32 | fh = open(options.clsfile, "r") 33 | dh = Datahelper() 34 | dcount = dh.load_cls_phrases(fh) 35 | #print "Dictionary size = %d" % (dcount) 36 | except IOError as e: 37 | print("I/O error({0}): {1}".format(e.errno, e.strerror)) 38 | print("I/O error:", sys.exc_info()) 39 | exit() 40 | except TypeError as e: 41 | print("Missing arguments ", e) 42 | exit() 43 | except: 44 | #print "Unexpected error:", sys.exc_info()[0] 45 | print("Unexpected error:", sys.exc_info()) 46 | exit() 47 | 48 | # stdin used to read in medications coding data 49 | #hdr = sys.stdin.readline() 50 | for line in sys.stdin: 51 | count += 1 52 | matched = False 53 | data = line.strip().split(',') 54 | all_phrases = [data[1]] 55 | if len(data) == 3: 56 | all_phrases += data[2].split('|') 57 | 58 | match_string = "" 59 | code_array, match_data, last_match, selected_code = dh.match_all_phrases(all_phrases) 60 | if len(code_array) > 0: 61 | if (options.multioutput == True): 62 | # Current policy: output one line per code match (can be multiple per input record) 63 | for code_elem in code_array: 64 | code_data =
code_elem.split("~") 65 | print("{0},{1},{2},{3},{4},{5},{6}".format(data[0], data[1], last_match, '|'.join(match_data), code_data[1], code_data[0], len(code_array))) 66 | else: 67 | print("{0},{1},{2},{3},{4},{5},{6}".format(data[0], data[1], last_match, '|'.join(match_data), 0, selected_code, len(code_array))) 68 | match_count += 1 69 | else: 70 | print("{0},{1},{2},{3},{4},{5},{6}".format(data[0], data[1], last_match, '|'.join(match_data), "NA", "NA", 0)) 71 | miss_count += 1 72 | 73 | return count, match_count, miss_count 74 | 75 | 76 | # execution flow starts here 77 | # 78 | parser = OptionParser() 79 | parser.add_option("-c", "--clsfile", dest="clsfile", 80 | help="file contains input classification system codes and descriptions", metavar="FILE") 81 | parser.add_option("-m", "--multioutput", dest="multioutput", 82 | help="output multiple classification codes per source system line", metavar="STR") 83 | 84 | start_time = time.time() 85 | (options, args) = parser.parse_args() 86 | 87 | if options.multioutput == "Y": 88 | options.multioutput = True 89 | else: 90 | options.multioutput = False 91 | 92 | count, match_count, miss_count = main(options) 93 | #print "END:", time.time() - start_time, "seconds", count, match_count, miss_count 94 | 95 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at pdappleby@gmail.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UK Biobank Self Reported Medication Data parsing and matching 2 | ## Background 3 | UK Biobank self-reported medications are represented as a list of codes per participant, captured at the time of the Baseline Assessment Interview. 4 | 5 | For example, participant id '000001' may have reported medications with UKBB codes 1140922174, 1140879424, 1140879616, 1197 and 2038460150. 6 | 7 | These are described in the UKBB medication codes table as: 8 | 9 | | Code | Description | Report Count | 10 | | -------- | -------------------------- | -----------: | 11 | |1140922174|alendronate sodium |6380 | 12 | |1140879424|alverine |308 | 13 | |1140879616|amitriptyline |10119 | 14 | |1197 |evening primrose oil product|1132 | 15 | |2038460150|paracetamol |100036 | 16 | 17 | There is no structure in the data and no means of grouping medications into categories such as "Drugs for Diabetes" or "Drugs to control Asthma". 18 | 19 | ## Aims 20 | Write and test software to match terms in the UK Biobank Self Reported Medication data coding table with terms in both the Anatomical Therapeutic Chemical (ATC) classification system and the British National Formulary (BNF) coding system. The overall aim is to assign higher-level, well-known codes to allow grouping of the data. The resulting matched data can then be used in conjunction with the UKBB medication codes assigned at assessment time to generate evidence for use in both individual clinical phenotypes and in ranges of clinical phenotypes as found in Phenome-Wide Association Studies (PheWAS). 21 | 22 | ## Description 23 | Matching code is written in Python (2.7 was initially used for development; changes for Python 3 have now been made). Extensive use of Bash shell wrappers is made to supply context at run time: code and data directory locations, data file names and database access parameters.
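To make the matching policy concrete, here is a condensed sketch of the fallback order implemented by `match_all_phrases` in `py/datahelper.py`: the whole phrase first, then the prefix trigram, then the prefix bigram, then single words that survive the exclusion rules. The `cls_phrases` and `excluded` arguments stand in for the dictionaries the real class builds at load time, and the simple length test below is a simplification of the full exclusion logic (which also drops measures such as '50mg' and all-digit tokens).

```python
import re

def normalise(phrase):
    # Collapse runs of non-word characters to single spaces and lower-case,
    # as in Datahelper.get_normalised_phrase
    return re.sub(r'[\W_ ]+', ' ', phrase).lower()

def match(phrase, cls_phrases, excluded):
    # A: try the whole phrase
    if phrase in cls_phrases:
        return cls_phrases[phrase]
    words = normalise(phrase).split()
    # 3 / 2: try the prefix trigram, then the prefix bigram
    for n in (3, 2):
        key = ' '.join(words[:n])
        if len(words) >= n and key in cls_phrases:
            return cls_phrases[key]
    # 1: try single words, skipping excluded and short (< 4 char) tokens
    for word in words:
        if word not in excluded and len(word) >= 4 and word in cls_phrases:
            return cls_phrases[word]
    return None
```

With the real exclusion rules, a phrase like 'atenolol 50mg tablet' falls through to the single-word step and matches on 'atenolol', since 'tablet' is on the excluded-word list and '50mg' is rejected as a measure.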
24 | 25 | The key features to note are: 26 | 27 | - Matching is performed using combinations of whole words only (no partial word matches); in the case of matching synonyms to descriptions (Step 01), whole-phrase matching is used. 28 | 29 | - Synonyms from the ChEMBL database are attached to both 'sides' of the main medication term match, as part of data preparation. 30 | 31 | - During testing, an excluded-word list (cf. stop words in Natural Language Processing) was built iteratively to prevent the software from making unwanted one-word matches. 32 | 33 | - All code is intended to be run from a Linux / Unix command line. 34 | 35 | The following subdirectories can be found in the repository: 36 | 37 | - *env/* The environment variables used are shown in a single file, 'common_tplt'; users should complete these and copy the file to one named 'common'. Users must also pre-define the **PROJDATA** and **PROJROOT** environment variables, as these are used as the roots of the data and code directory trees. Parameters for local ChEMBL database access are also required, for drug synonym extraction and ATC code extraction. The code example supplied is for the sqlite edition of the ChEMBL database, which can be found at [The ChEMBL site, download section](https://www.ebi.ac.uk/chembl); the project used version 23, which is no longer the latest version at the time of writing. Downloads for other DBMSs are available. 38 | 39 | - *py/* Python scripts, including the scripts that match synonyms prior to matching across coding systems. Also included is the module 'datahelper.py', which is where the text matching code is to be found. There are two lower-level directories holding scripts to extract and format ChEMBL data and to process phenotype data. 40 | 41 | - *sh/* Bash shell scripts: wrappers for the Python code, split into several main functions and provided for coding against the ATC and BNF classification systems. 42 | 43 | Top-level scripts are prefixed '01_', '02_', '03_' and '04_' for each classification system and call several lower-level bash scripts. The '03_' scripts are for code assignment to data for individual medication reports in UKBB; the path and filename for a project's UKBB phenotype data must be supplied via the sourced 'common' environment parameter file. 44 | 45 | - *data/* Generated match data for both ATC and BNF coding; this does not include manually assigned coding. The excluded-word list, which should probably be an independent text file, is embedded in ../py/datahelper.py and can be extracted using the .../*py*/list_excl_words.py script. 46 | 47 | ## Running 48 | Once the environment has been set up (see the note on the .../*env*/common file above), four scripts are run, with parameters for either ATC or BNF code assignment, through to binary (0/1) phenotype generation for PheWAS. 49 | 50 | Script '01' runs data preparation steps; note that raw BNF data is not supplied in this repository due to potential licensing requirements. At this point ChEMBL synonyms and, in the case of the ATC-based match, ATC codes and terms are extracted from the ChEMBL database. 51 | 52 | Script '02' runs matching scripts to 'merge' in ChEMBL synonym data and produce data on matched and unmatched UKBB medication codes. 53 | 54 | Script '03' calls scripts to extract medication detail data from the main UKBB phenotype csv file (the name of this varies by project and should be assigned in the .../*env*/common file via an environment variable) and then to assign the ATC or BNF codes output in step 02, as sketched below.
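Column extraction relies on the UKB header convention of repeating a field id across instances and array slots (e.g. `20003-0.0`, `20003-0.1`, ...). The sketch below shows the core of what `py/pheno/cut_main_csv_file.py` does: keep the participant id column, then every column whose header prefix (the text before the '-') equals a requested field id. Field id 20003 (self-reported medication) is used for illustration; the real script takes the file name, prefixes and id column as options.

```python
import csv
import sys

def cut_columns(reader, prefix, idcol=0):
    # Keep the id column plus every column whose header matches the
    # requested UKB field id prefix (the text before the '-')
    header = next(reader)
    cols = [idcol] + [i for i, name in enumerate(header)
                      if name.split('-')[0] == prefix]
    yield [header[i] for i in cols]
    for row in reader:
        yield [row[i] for i in cols]

# Illustrative use: cut the self-reported medication columns from a CSV
# file supplied on stdin
for out in cut_columns(csv.reader(sys.stdin), "20003"):
    print(",".join(out))
```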
55 | 56 | Script '04' calls scripts to annotate and generate PheWAS phenotypes as binary 1 (CASE) or 0 (CONTROL) 57 | 58 | For ATC-based matching run: 59 | - 01_atc_prepare_sqlite.sh 60 | - 02_atc_match.sh 61 | - 03_get_ukbb_srmed_data_atc.sh 62 | - 04_make_atc_phewas_phenotypes.sh 63 | 64 | For BNF-based matching run: 65 | - 01_bnf_prepare_sqlite.sh 66 | - 02_bnf_match.sh 67 | - 03_get_ukbb_srmed_data_bnf.sh 68 | - 04_make_bnf_phewas_phenotypes.sh 69 | 70 | ## Flow Summary, steps 01 and 02 71 | ![](images/ukbb_srmed.png) 72 | 73 | ## Notice 74 | 75 | BNF coding system data was obtained and is presented in accordance with the “Open Government Licence for Public Sector Information” (http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/) and the NHS Business Services Authority (NHSBSA) “Terms and Conditions for Users” (https://www.nhsbsa.nhs.uk/our-policies/terms-and-conditions). Authors: PDA, ASFD and ERJ as users of this resource also make the following statement consistent with these terms and conditions: “NHSBSA BNF Classification Coding, NHSBSA Copyright 2019” This information is licenced under the terms of the Open Government Licence. 76 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 
115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 
166 | -------------------------------------------------------------------------------- /py/datahelper.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import Counter 3 | # 4 | # Methods for assisting with matching, 5 | # generating dictionary and matching keys, 6 | # listing exclusion words and applying exclusion rules 7 | # 8 | class Datahelper: 9 | def __init__(self): 10 | self.cls_phrases = {} 11 | # Explicit exclusions - short words and measures are excluded in code 12 | # NLP systems / libraries, such as the NLTK, refer to these as "stop-words" 13 | self.excluded_words = { 14 | 'acting': 1, 15 | 'active': 1, 16 | 'activated': 1, 17 | 'artificial': 1, 18 | 'band': 1, 19 | 'coconut': 1, 20 | 'food': 1, 21 | 'biscuit': 1, 22 | 'biscuits': 1, 23 | 'good': 1, 24 | 'soft': 1, 25 | 'choice': 1, 26 | 'house': 1, 27 | 'half': 1, 28 | 'total': 1, 29 | 'alka': 1, 30 | 'alpha': 1, 31 | 'beta': 1, 32 | 'night': 1, 33 | 'nurse': 1, 34 | 'dome': 1, 35 | 'continus': 1, 36 | 'depot': 1, 37 | 'mini': 1, 38 | 'micro': 1, 39 | 'over': 1, 40 | 'long': 1, 41 | 'slow': 1, 42 | 'daily': 1, 43 | 'hayfever': 1, 44 | 'counter': 1, 45 | 'mild': 1, 46 | 'with': 1, 47 | 'other': 1, 48 | 'single': 1, 49 | 'double': 1, 50 | 'triple': 1, 51 | 'once': 1, 52 | 'flavour': 1, 53 | 'fruit': 1, 54 | 'cream': 1, 55 | 'need': 1, 56 | 'needs': 1, 57 | 'vera': 1, 58 | 'infusion': 1, 59 | 'succinate': 1, 60 | 'palmitate': 1, 61 | 'intensol': 1, 62 | 'poly': 1, 63 | 'prep': 1, 64 | 'bag': 1, 65 | 'bags': 1, 66 | 'preparation': 1, 67 | 'preparations': 1, 68 | 'preps': 1, 69 | 'shampoo': 1, 70 | 'shower': 1, 71 | 'wash': 1, 72 | 'enema': 1, 73 | 'soap': 1, 74 | 'solution': 1, 75 | 'soln': 1, 76 | 'contact': 1, 77 | 'incontinence': 1, 78 | 'diluent': 1, 79 | 'blocker': 1, 80 | 'emulsion': 1, 81 | 'emuls': 1, 82 | 'emulsifying': 1, 83 | 'lotion': 1, 84 | 'lotio': 1, 85 | 'derm': 1, 86 | 'aveeno': 1, 87 | 'soluble': 1, 88 | 'suspension': 1, 89 | 'susp': 1, 90 | 'various': 1, 91 | 'paint': 1, 92 | 'liquid': 1, 93 | 'tablet': 1, 94 | 'tablets': 1, 95 | 'pill': 1, 96 | 'pills': 1, 97 | 'perles': 1, 98 | 'pastille': 1, 99 | 'chewable': 1, 100 | 'granules': 1, 101 | 'mixture': 1, 102 | 'mixtures': 1, 103 | 'remedy': 1, 104 | 'remedies': 1, 105 | 'therapy': 1, 106 | 'therapies': 1, 107 | 'emollient': 1, 108 | 'peel': 1, 109 | 'gppe': 1, 110 | 'ointment': 1, 111 | 'effervescent': 1, 112 | 'capsule': 1, 113 | 'spansule': 1, 114 | 'caplet': 1, 115 | 'cycle': 1, 116 | 'husk': 1, 117 | 'strong': 1, 118 | 'strength': 1, 119 | 'suppository': 1, 120 | 'supplement': 1, 121 | 'compound': 1, 122 | 'comp': 1, 123 | 'caustic': 1, 124 | 'pellet': 1, 125 | 'elixir': 1, 126 | 'drops': 1, 127 | 'autohaler': 1, 128 | 'turbohaler': 1, 129 | 'inhaler': 1, 130 | 'sach': 1, 131 | 'sachet': 1, 132 | 'sachets': 1, 133 | 'syrup': 1, 134 | 'dried': 1, 135 | 'castor': 1, 136 | 'oilatum': 1, 137 | 'oily': 1, 138 | 'salt': 1, 139 | 'salts': 1, 140 | 'saline': 1, 141 | 'yeast': 1, 142 | 'tears': 1, 143 | 'ophthalmic': 1, 144 | 'complexes': 1, 145 | 'comp': 1, 146 | 'aqua': 1, 147 | 'aqueous': 1, 148 | 'hormone': 1, 149 | 'sugar': 1, 150 | 'plain': 1, 151 | 'anti': 1, 152 | 'retard': 1, 153 | 'drug': 1, 154 | 'lozenge': 1, 155 | 'lozenges': 1, 156 | 'nasal': 1, 157 | 'spray': 1, 158 | 'paste': 1, 159 | 'tincture': 1, 160 | 'oral': 1, 161 | 'injection': 1, 162 | 'injectable': 1, 163 | 'applicator': 1, 164 | 'ampoule': 1, 165 | 'syringe': 1, 166 | 'topical': 1, 167 | 'duopack': 1, 168 | 'pack': 1, 169 
| 'combination': 1, 170 | 'combinations': 1, 171 | 'prefilled': 1, 172 | 'continuous': 1, 173 | 'dispersible': 1, 174 | 'patch': 1, 175 | 'gastro': 1, 176 | 'resistant': 1, 177 | 'allergy': 1, 178 | 'relief': 1, 179 | 'wool': 1, 180 | 'sand': 1, 181 | 'tube': 1, 182 | 'stnd': 1, 183 | 'aloe': 1, 184 | 'ortho': 1, 185 | 'auto': 1, 186 | 'health': 1, 187 | 'cover': 1, 188 | 'bath': 1, 189 | 'powder': 1, 190 | 'resin': 1, 191 | 'solvent': 1, 192 | 'solv': 1, 193 | 'mist': 1, 194 | 'saliva': 1, 195 | 'balsam': 1, 196 | 'tonic': 1, 197 | 'additive': 1, 198 | 'liniment': 1, 199 | 'recon': 1, 200 | 'combined': 1, 201 | 'dual': 1, 202 | 'substitute': 1, 203 | 'formula': 1, 204 | 'green': 1, 205 | 'yellow': 1, 206 | 'red': 1, 207 | 'blue': 1, 208 | 'orange': 1, 209 | 'buff': 1, 210 | 'golden': 1, 211 | 'white': 1, 212 | 'paed': 1, 213 | 'paediatric': 1, 214 | 'peppermint': 1, 215 | 'mint': 1, 216 | 'pine': 1, 217 | 'caramel': 1, 218 | 'natural': 1, 219 | 'vitamin': 1, 220 | 'vitamins': 1, 221 | 'enzyme': 1, 222 | 'product': 1, 223 | 'junior': 1, 224 | 'cold': 1, 225 | 'unknown': 1, 226 | 'free': 1, 227 | 'body': 1, 228 | 'nose': 1, 229 | 'sinus': 1, 230 | 'stomach': 1, 231 | 'scalp': 1, 232 | 'intramuscular': 1, 233 | 'sublingual': 1, 234 | 'breath': 1, 235 | 'sleep': 1, 236 | 'drowsy': 1, 237 | 'litre': 1, 238 | 'actuated': 1, 239 | 'vantage': 1, 240 | 'numark': 1, 241 | 'care': 1, 242 | 'galpharm': 1, 243 | 'merck': 1, 244 | 'pharmacy': 1, 245 | 'fish': 1, 246 | 'aluminium': 1, 247 | 'calcium': 1, 248 | 'sodium': 1, 249 | 'cromoglycate': 1, 250 | 'potassium': 1, 251 | 'chloride': 1, 252 | 'nitrate': 1, 253 | 'sulphate': 1, 254 | 'salicylic': 1, 255 | 'hydrochloride': 1, 256 | 'disodium': 1, 257 | 'zinc': 1, 258 | 'magnesium': 1, 259 | 'breathe': 1, 260 | 'wound': 1, 261 | 'citrate': 1, 262 | 'sulfate': 1, 263 | 'calmurid': 1, 264 | 'fucibet': 1, 265 | 'fucidin': 1, 266 | 'betnesol': 1, 267 | 'tobradex': 1, 268 | 'nystaform': 1, 269 | 'orabase': 1, 270 | 'betnovate': 1, 271 | 'polystyrene': 1, 272 | 'undecenoic': 1, 273 | 'oxide': 1, 274 | 'phosphate': 1, 275 | 'hydrate': 1, 276 | 'acetate': 1, 277 | 'fumarate': 1, 278 | 'sandoz': 1, 279 | 'pain': 1, 280 | 'mite': 1, 281 | 'remover': 1, 282 | 'removers': 1, 283 | 'acid': 1, 284 | 'alcohol': 1, 285 | 'coal': 1, 286 | 'extract': 1, 287 | 'mineral': 1, 288 | 'minerals': 1, 289 | 'forte': 1, 290 | 'simple': 1, 291 | 'plus': 1, 292 | 'multi': 1, 293 | 'vita': 1, 294 | 'adult': 1, 295 | 'liver': 1, 296 | 'skin': 1, 297 | 'factor': 1, 298 | 'human': 1, 299 | 'methyl': 1, 300 | 'piperazine': 1, 301 | 'deep': 1, 302 | 'ultra': 1, 303 | 'daktarin': 1, 304 | 'voltarol': 1, 305 | 'insulin': 1, 306 | 'panoxyl': 1, 307 | } 308 | self.valid_short_words = { 309 | 'gtn': 1, 310 | } 311 | 312 | def load_cls_phrases(self, fh): 313 | """ 314 | Build a dictionary of key phrases and words vs lists of codes from 315 | Classification System Data: see self.get_key_list(phrase) 316 | """ 317 | for line in fh: 318 | data = line.strip().split(',') 319 | # guards against unparseable lines 320 | if len(data) < 2: 321 | continue 322 | code = data[0] 323 | phrase_array = [data[1].lower().strip()] 324 | if len(data) > 2: 325 | syn_array = data[2].lower().strip().split('|') 326 | for syn in [s for s in syn_array if s not in phrase_array]: 327 | phrase_array.append(syn) 328 | 329 | for phrase in phrase_array: 330 | for key in set(self.get_key_list(phrase)): 331 | if key not in self.cls_phrases: 332 | self.cls_phrases[key] = [] 333 | self.cls_phrases[key].append(code) 334 | 335 | 
return len(self.cls_phrases) 336 | 337 | def get_phrase_dictionary(self): 338 | return self.cls_phrases 339 | 340 | def get_phrase_dictionary_keys(self): 341 | return sorted(self.cls_phrases.keys()) 342 | 343 | def get_excluded_words(self): 344 | return sorted(self.excluded_words.keys()) 345 | 346 | def match_all_phrases(self, inphrases): 347 | """ 348 | The most complicated function 349 | Attempt to match the argument phrases to the cls_phrases dictionary 350 | First attempt a match of all phrase, then all trigrams, then all 351 | bigrams, then single words 352 | 353 | Return: 354 | A list of matched_code counts, match_path, the matched phrase, the 355 | most commonly matched code(s) 356 | OR 357 | [], match path, last attempted match, None 358 | """ 359 | # temporary - attempted matches 360 | attempted_matches = [] 361 | phrase_attempts = {} 362 | phrase = "" 363 | step = "A" 364 | # ALL full phrases 365 | for phrase in inphrases: 366 | phrase_attempts[phrase] = 1 367 | attempted_matches.append(phrase + ':' + step) 368 | if phrase in self.cls_phrases: 369 | match_choices = self.cls_phrases[phrase] 370 | return (self.get_list_counts(match_choices), attempted_matches, 371 | phrase, self.get_most_common(match_choices)) 372 | 373 | # Normalised version of ALL all full phrases 374 | phrases = [self.get_normalised_phrase(p) for p in inphrases] 375 | 376 | # 3 all prefix trigrams 377 | step = "3" 378 | for ngram in [p.split()[0:3] for p in phrases if len(p.split()) > 2]: 379 | phrase = ' '.join(ngram) 380 | phrase_attempts[phrase] = 1 381 | attempted_matches.append(phrase + ':' + step) 382 | if phrase in self.cls_phrases: 383 | match_choices = self.cls_phrases[phrase] 384 | return (self.get_list_counts(match_choices), attempted_matches, 385 | phrase, self.get_most_common(match_choices)) 386 | 387 | # 2 all prefix bigrams 388 | step = "2" 389 | for ngram in [p.split()[0:2] for p in phrases if len(p.split()) > 1]: 390 | phrase = ' '.join(ngram) 391 | phrase_attempts[phrase] = 1 392 | attempted_matches.append(phrase + ':' + step) 393 | if phrase in self.cls_phrases: 394 | match_choices = self.cls_phrases[phrase] 395 | return (self.get_list_counts(match_choices), attempted_matches, 396 | phrase, self.get_most_common(match_choices)) 397 | 398 | # 1 all valid words 399 | step = "1" 400 | for phr_elem in phrases: 401 | for phrase in [w.strip() for w in phr_elem.split() 402 | if self.isExcluded(w.strip()) == False and w.strip() not in phrase_attempts]: 403 | phrase_attempts[phrase] = 1 404 | attempted_matches.append(phrase + ':' + step) 405 | if phrase in self.cls_phrases: 406 | match_choices = self.cls_phrases[phrase] 407 | return (self.get_list_counts(match_choices), attempted_matches, 408 | phrase, self.get_most_common(match_choices)) 409 | 410 | return [], attempted_matches, phrase, None 411 | 412 | def match_phrase(self, phrase): 413 | """ 414 | NOT USED CURRENTLY 415 | Attempt to match the argument phrase to the cls_phrases dictionary 416 | (built from all words passed in at init time) 417 | A phrase is matched iff: 418 | The whole string matches matches OR 419 | The prefix trigram matches OR 420 | The prefix bigram matches OR 421 | A single word, which is not an excluded word, matches 422 | 423 | Return: 424 | A matched code from the classification system or None 425 | """ 426 | key = None 427 | match_phrase = None 428 | for key in self.get_key_list(phrase): 429 | if key in self.cls_phrases: 430 | match_phrase = key 431 | break 432 | 433 | if match_phrase == None: 434 | return None, key 435 | 
return self.get_most_common(self.cls_phrases[match_phrase]), key 436 | 437 | def get_key_list(self, phrase): 438 | key_list = [] 439 | if self.isExcluded(phrase) == False: 440 | key_list = [phrase] 441 | 442 | ngram = self.get_normalised_phrase(phrase) 443 | if self.isExcluded(ngram) == False and ngram not in key_list: 444 | key_list.append(ngram) 445 | word_list = ngram.split() 446 | if len(word_list) > 2: 447 | key_list.append(' '.join(word_list[0:3])) 448 | if len(word_list) > 1: 449 | key_list.append(' '.join(word_list[0:2])) 450 | 451 | for word in [x for x in word_list if self.isExcluded(x.strip()) == False]: 452 | if word not in key_list: 453 | key_list.append(word) 454 | 455 | return key_list 456 | 457 | def get_merge_key_list(self, phrase): 458 | """ 459 | Get a list of keys for use while merging synonyms 460 | """ 461 | key_list = [] 462 | if self.isExcludedFromMerge(phrase) == False: 463 | key_list = [phrase] 464 | 465 | ngram = self.get_normalised_phrase(phrase) 466 | if self.isExcluded(ngram) == False and ngram not in key_list: 467 | key_list.append(ngram) 468 | word_list = ngram.split() 469 | if len(word_list) > 2: 470 | key_list.append(' '.join(word_list[0:3])) 471 | if len(word_list) > 1: 472 | key_list.append(' '.join(word_list[0:2])) 473 | 474 | for word in [x for x in word_list if self.isExcludedFromMerge(x.strip()) == False]: 475 | if word not in key_list: 476 | key_list.append(word) 477 | 478 | return key_list 479 | 480 | def get_key_list_whole_phrases(self, phrase): 481 | """ 482 | EXPERIMENTAL: Get a list of keys from whole phrases 483 | """ 484 | key_list = [phrase] 485 | ngram = self.get_normalised_phrase(phrase) 486 | key_list.append(ngram) 487 | 488 | return key_list 489 | 490 | def isExcluded(self, word): 491 | """ 492 | Used in the main match 493 | """ 494 | return ((self.isExcludedWord(word) != False) 495 | or (self.isMeasure(word) != False) 496 | or (self.isAllDigits(word) != False) 497 | or (self.isShortWord(word) != False)) 498 | 499 | def isExcludedFromMerge(self, word): 500 | """ 501 | Used when buliding dictionaries for use in 502 | synonym merging 503 | """ 504 | return ((self.isExcludedWord(word) != False) 505 | or (self.isMeasure(word) != False) 506 | or (self.isShortWord(word) != False)) 507 | 508 | def isExcludedWord(self, word): 509 | """ 510 | """ 511 | return word in self.excluded_words 512 | 513 | def get_most_common(self, lst): 514 | """ 515 | Return the most commonly occuring value in a list 516 | THIS NEEDS RE-IMPLEMENTING - not currently used 517 | """ 518 | data = Counter(lst) 519 | mc = data.most_common(2) 520 | return data.most_common(1)[0][0] 521 | 522 | def get_list_counts(self, lst): 523 | """ 524 | Return counts for data elements in a list 525 | """ 526 | counts = Counter(lst) 527 | return [c + "~" + str(counts[c]) for c in sorted(counts)] 528 | 529 | def get_best_guess(self, lst): 530 | """ 531 | NOT USED 532 | Return the best guess at a value from 533 | a list of strings 534 | """ 535 | maxlen = 0 536 | pass 537 | 538 | def make_pheno_string(self, words): 539 | return re.sub(r' +', '_', words).lower() 540 | 541 | def get_normalised_phrase(self, sentence): 542 | """ 543 | Regex to replace (multiples of) 544 | non-word characters and space with a single space 545 | """ 546 | return re.sub(r'[\W_ ]+', ' ', sentence).lower() 547 | 548 | def format_digit_code(self, code, level=3): 549 | code = code.strip() 550 | ch = code[0:2] 551 | s = code[2:4] 552 | ss = '00' 553 | if len(code) >=6: 554 | ss = code[4:6] 555 | if ss != '00' and level 
== 3: 556 | return "%d.%d.%d" % (int(ch),int(s),int(ss)) 557 | return "%d.%d" % (int(ch),int(s)) 558 | 559 | def format_atc_code(self, code, size=4): 560 | return code[:size] 561 | 562 | def isMeasure(self, word): 563 | """ 564 | Do we have a stand alone measure symbol 565 | """ 566 | return ((re.match('\d+(mg$|ml$|iu$|mcg$|uml$|u1ml$|mg4ml$|micrograms$|million$|cm$|mm$|unit$|units$|hb$)', word)) != None) 567 | 568 | def isAllDigits(self, word): 569 | """ 570 | Does the word consist of only digits? 571 | """ 572 | return ((re.match('^\d+$', word)) != None) 573 | 574 | def isShortWord(self, word): 575 | """ 576 | Check if the word is longer than 3 chars 577 | """ 578 | return len(word) < 4 and word not in self.valid_short_words 579 | 580 | def isSingleLetter(self, word): 581 | """ 582 | Check if the word consists of a single letter? 583 | """ 584 | return (re.match('^\w$', word)) != None 585 | -------------------------------------------------------------------------------- /data/atc_unmatched_list.csv: -------------------------------------------------------------------------------- 1 | UKBB_code,UKBB_description 2 | 1140909674,cod liver oil capsule 3 | 1140923346,co-codamol 4 | 1140916682,evening primrose oil 5 | 1140876592,multivitamin+mineral preparations 6 | 1140911732,garlic product 7 | 1140883066,insulin product 8 | 1140888538,zinc product 9 | 1140923350,co-dydramol 10 | 1140870788,calcium salts 11 | 1140865354,gaviscon liquid 12 | 1140852948,calcium+vitamin d 500units tablet 13 | 1189,co-enzyme q10/ubiquinone/bio-quinone/coenzyme q10 14 | 1199,food supplement/plant/herbal extract 15 | 1140882694,betnovate cream 16 | 1140925800,movicol oral powder 17 | 1140851812,gtn 400micrograms spray 18 | 1140865010,viscotears liquid eye gel 19 | 1141145812,minerals - magnesium 20 | 1197,evening primrose oil product 21 | 1141180036,fybogel orange s/f granules 22 | 1141168326,kliovance 1mg/0.5mg tablet 23 | 1140922804,premique 0.625mg/5mg tablet 24 | 1201,st john's wort/hypericum [ctsu] 25 | 1140878226,diprobase cream 26 | 1140911730,flax oil tablet 27 | 1140922562,femoston 1/10 tablet 28 | 1140857636,prempak 0.625 tablet 29 | 1140911680,starflower oil 30 | 1140923336,co-tenidone 31 | 1140869180,microgynon 30 tablet 32 | 1140911736,ginseng product 33 | 1205,saw palmetto product 34 | 1140871168,voltarol 25mg e/c tablet 35 | 1140865396,buscopan 10mg tablet 36 | 1203,aloe vera product 37 | 1140923402,co-amilofruse 38 | 1140876404,aqueous cream bp 39 | 1141168752,peptac liquid 40 | 1140865416,colpermin 0.2ml m/r gel e/c capsule 41 | 1141176732,carbomers 42 | 1141172918,celluvisc 1% single-use eye drops 43 | 1140923348,co-proxamol 44 | 1140878304,e45 cream 45 | 1140910640,luteine 46 | 1140917056,kliofem tablet 47 | 1140871688,solpadol caplet 48 | 1140856342,syndol tablet 49 | 1140872112,epanutin 25mg capsule 50 | 1140867504,priadel 200mg m/r tablet 51 | 1140881882,timoptol 0.25% eye drops 52 | 1140923276,co-amilozide 53 | 1140911638,kelp+garlic product 54 | 1141168122,solpadol capsule 55 | 1140864196,climagest 1mg tablet 56 | 1140911640,lecithin product 57 | 1141184726,xalacom 0.005%/0.5% eye drops 58 | 1140882776,fucibet cream 59 | 1140865414,peppermint oil product 60 | 1140883968,carmellose 61 | 1140858452,hepacon b12 1mg/1ml injection 62 | 1140871680,tylex capsule 63 | 1140881474,normacol granules 64 | 1140882618,diprosalic ointment 65 | 1140868458,hormonin tablet 66 | 1140869324,loestrin 20 tablet 67 | 1140878324,oilatum cream 68 | 1140868518,nuvelle tablet 69 | 1140878186,liquifilm tears 1.4% eye 
drops 70 | 1141178052,zapain caplet 71 | 1140869176,logynon tablet 72 | 1140926430,climesse tablet 73 | 1140869346,cilest tablet 74 | 1140875632,movelat gel 75 | 1140922806,premique cycle 10mg tablet 76 | 1140872036,paramax tablet 77 | 1140870488,forceval capsule 78 | 1140910698,oil of peppermint 79 | 1141172686,coaprovel 150mg/12.5mg tablet 80 | 1140927320,dermol 500 lotion 81 | 1140869164,mercilon tablet 82 | 1140882626,betnesol 0.1% eye/ear/nose drops 83 | 1140872338,madopar 62.5 capsule 84 | 1141172436,indivina 1mg/2.5mg tablet 85 | 1140862526,sodium cromoglycate 86 | 1140912212,menophase tablet 87 | 1141167206,oestrogel 0.06% gel 88 | 1141167848,asasantin retard m/r capsule 89 | 1140878184,sno-tears eye drops 90 | 1141168650,solpadeine capsule 91 | 1140864070,kapake tablet 92 | 1141188210,berocca effervescent tablet 93 | 1140882112,co-careldopa 94 | 1140869162,marvelon tablet 95 | 1141187304,codipar caplet 96 | 1141188836,felendil xl 5mg m/r tablet 97 | 1140911636,kalms tablet 98 | 1140878498,polytar liquid 99 | 1140888578,antihypertensive 100 | 1141185986,cetraben emollient cream 101 | 1140876384,glandosane plain spray 102 | 1140921088,tridestra tablet 103 | 1141202030,estradot 25micrograms patch 104 | 1141173872,cetraben cream 105 | 1140864618,zestoretic 10 tablet 106 | 1141168648,solpadeine tablet 107 | 1140917450,oestrogel 1.25g gel 108 | 1140875630,movelat cream 109 | 1140882374,co-amoxiclav 110 | 1140868538,sustanon 100 oily injection 111 | 1140865762,regulan 3.6g/sachet powder 112 | 1141195836,dermol cream 113 | 1140888432,potassium product 114 | 1140878248,oilatum emollient bath additive 115 | 1140873780,co-trimoxazole 116 | 1140881414,gastrocote liquid 117 | 1141179824,yasmin tablet 118 | 1140868520,estracombi tts patch 119 | 1140878190,minims artificial tears single-use eye drops 120 | 1140876006,polyvinyl alcohol 1% eye drops 121 | 1140869186,ovranette tablet 122 | 1141189134,stalevo 50mg / 12.5mg / 200mg tablet 123 | 1141165512,kapake capsule 124 | 1140878286,diprobase ointment 125 | 1140868514,trisequens tablet 126 | 1141200400,amlostin 5mg tablet 127 | 1140871996,sanomigran 500micrograms tablet 128 | 1140865686,dulco-lax 5mg e/c tablet 129 | 1140871682,solpadol effervescent tablet 130 | 1141167140,exorex lotion 131 | 1140882542,fucidin cream 132 | 1140928880,geltears gel 133 | 1140876312,emulsifying ointment bp 134 | 1140870800,sandocal 400 effervescent tablet 135 | 1140879482,antacid tablet 136 | 1140878308,aveeno cream 137 | 1140878236,calmurid cream 138 | 1141178054,zapain capsule 139 | 1140870284,prostap sr 3.75mg injection (pdr for recon)+diluent+kit 140 | 1140865418,mintec 0.2ml e/c capsule 141 | 1140865658,lomotil tablet 142 | 1140880234,dianette tablet 143 | 1140869334,femodene tablet 144 | 1140865170,dried yeast 300mg tablet 145 | 1141163146,beclo-aqua 50 nasal spray 146 | 1140910566,glyclizide 147 | 1140875596,algesal cream 148 | 1140916342,beta-blocker 149 | 1140861884,maxepa 1g capsule 150 | 1140883162,combined oral contraceptive product 151 | 1141172628,almogran 12.5mg tablet 152 | 1141168374,dermol 200 shower emollient 153 | 1140867152,depixol 3mg tablet 154 | 1140882464,daktarin 2% cream 155 | 1140882110,co-beneldopa 156 | 1141186750,biotene oralbalance oral gel 157 | 1140866402,dyazide tablet 158 | 1140917128,imedeen tablet 159 | 1141187790,micardisplus 40mg/12.5mg tablet 160 | 1140876424,cocois ointment 161 | 1141200878,oilatum bath formula liquid bath additive 162 | 1140925778,paracodol capsule 163 | 1140881334,co-phenotrope 164 | 1140865840,predfoam 
20mg enema 165 | 1140871684,remedeine tablet 166 | 1140876394,salivix pastille 167 | 1140860838,gtn 300micrograms sublingual tablet 168 | 1140874794,betnesol 500mcg soluble tablet 169 | 1140883014,ortho-dienoestrol 0.01% cream 170 | 1140877600,halibut-liver oil capsule 171 | 1140870486,folicin tablet 172 | 1140866420,moduretic tablet 173 | 1141164652,unguentum m cream 174 | 1140856442,solpadeine soluble effervescent tablet 175 | 1141166368,femodette tablet 176 | 1140869266,trinovum tablet 177 | 1140868794,sodium clodronate 178 | 1141185480,ispagel 3.5g/sachet s/f powder 179 | 1140876026,simple eye ointment 180 | 1140869256,brevinor tablet 181 | 1141189064,solpadeine plus soluble effervescent tablet 182 | 1140865366,maalox plus suspension 183 | 1140857200,calcium sulphaloxate 184 | 1140865380,kolanticon gel 185 | 1141173444,dulco-lax 2.5mg perles 186 | 1140910730,cromoglycate 187 | 1140864346,cocois scalp ointment 188 | 1140860324,tenoret 50 tablet 189 | 1140863552,paramol tablet 190 | 1140888644,emollient product 191 | 1140882778,lotriderm cream 192 | 1141181882,betaferon 300micrograms injection (pdr for recon)+diluent 193 | 1140865808,lactugal solution 194 | 1141193170,eccoxolac 300mg capsule 195 | 1140878052,fml eye drops 196 | 1140871174,voltarol 100mg suppository 197 | 1140867952,fluanxol 500micrograms tablet 198 | 1140882976,growth hormone product 199 | 1140877890,transvasin cream 200 | 1140871686,remedeine forte tablet 201 | 1140871162,vitamins capsule bpc 202 | 1140868804,bonefos 400mg capsule 203 | 1140866352,navispare tablet 204 | 1140881622,co-danthramer 205 | 1140881318,co-magaldrox 206 | 1140923272,co-triamterzide 207 | 1140888628,hydrocortistab 1% cream 208 | 1140861922,lipid lowering drug 209 | 1140926686,femapak 40 patch+tablet 210 | 1140881324,magnesium trisilicate 211 | 1140871920,dhc continus 60mg m/r tablet 212 | 1140860784,innozide tablet 213 | 1140860358,tenif capsule 214 | 1141189008,solpadeine plus capsule 215 | 1140880726,hirudoid cream 216 | 1140878170,hypotonic artificial tears eye drops 217 | 1140874950,prednesol 5mg tablet 218 | 1140873930,septrin 480mg tablet 219 | 1141179944,galpharm hayfever and allergy relief 10mg tablet 220 | 1141167748,solpadeine max tablet 221 | 1141167678,pharmaton capsule 222 | 1140881422,asilone liquid 223 | 1140872798,augmentin 375mg tablet 224 | 1140871004,vitamins b+c 225 | 1140860328,tenoretic tablet 226 | 1141180766,novofem tablet 227 | 1140888630,hydrocortisyl 1% cream 228 | 1140880056,sulphur product 229 | 1140861416,paroven 250mg capsule 230 | 1141189010,solpadeine plus tablet 231 | 1140888456,dioralyte product 232 | 1140869190,trinordiol tablet 233 | 1140856348,veganin tablet 234 | 1141188766,duac once daily gel 235 | 1140881714,capozide tablet 236 | 1140878182,hypotears eye drops 237 | 1140876350,cod liver oil+zinc oxide 11.4/38% ointment 238 | 1140872734,augmentin 625mg tablet 239 | 1140861418,oxerutins 250mg capsule 240 | 1140910734,disodium cromoglycate 241 | 1140878288,unguentum merck cream 242 | 1140878280,ultrabase cream 243 | 1141145830,fybozest orange 3.5g s/f granules 244 | 1140911760,beechams powder 245 | 1140870900,phosphate-sandoz tablet 246 | 1140856436,propain tablet 247 | 1141190656,kapake 30/500 effervescent tablet 248 | 1140925930,movelat relief gel 249 | 1140916948,e45 lotion 250 | 1140878350,carbo-dome cream 251 | 1140869262,ovysmen tablet 252 | 1140869260,norimin tablet 253 | 1140869254,binovum tablet 254 | 1140865358,maalox tablet 255 | 1140856336,codis dispersible tablet 256 | 1141200882,oilatum bath 
formula liquid bath additive 300ml 257 | 1141193140,evening primrose oil 20% cream 258 | 1141185108,aveeno lotion 259 | 1141172966,propain caplet 260 | 1140882544,fucidin ointment 261 | 1140878222,artificial saliva 262 | 1140870786,ketovite tablet 263 | 1140868260,distalgesic tablet 264 | 1140867342,clopixol 2mg tablet 265 | 1140866318,spirolone 25mg tablet 266 | 1140865724,sodium picosulphate 267 | 1141172758,as saliva orthana spray 268 | 1140926626,saliveze spray 269 | 1140925936,movelat relief cream 270 | 1140917114,alka-seltzer tablet 271 | 1140876498,soya oil 84.75% bath oil 272 | 1140870840,solvazinc 200mg effervescent tablet 273 | 1140865368,mucogel suspension 274 | 1140860398,kalten capsule 275 | 1191,indigestion remedy (over the counter) 276 | 1141194228,numark hayfever and allergy relief 10mg tablet 277 | 1140881624,co-danthrusate 278 | 1140870328,pregaday tablet 279 | 1140866416,moduret 25 tablet 280 | 1141191198,optrex eye drops 281 | 1141186902,bioxtra oral gel 282 | 1141175756,migramax sachet powder 283 | 1141162940,e45 emollient bath oil 284 | 1140910728,clodronate disodium 285 | 1140878426,polytar emollient bath additive 286 | 1140875628,axsain cream 287 | 1140870102,interferons 288 | 1140856454,df118 30mg tablet 289 | 1140852996,metatone tonic 290 | 1141200092,eflornitine 11.5% cream 291 | 1141188530,crampex tablet 292 | 1141172224,acidex oral suspension 293 | 1141162982,muse 125micrograms pellet 294 | 1140884298,ethambutolol 295 | 1140868258,aspav dispersible tablet 296 | 1141146084,oralbalance oral gel 297 | 1140911574,saline 0.9% nose drops 298 | 1140910614,prindolol 299 | 1140878258,diprobath bath additive 300 | 1140873818,colomycin 1million units injection (pdr for recon) 301 | 1141165476,triapin mite 2.5mg/2.5mg tablet 302 | 1140917062,natrasleep tablet 303 | 1140910552,folate product 304 | 1140909938,coc - combined oral contraceptives 305 | 1140881894,diuretic 306 | 1140876164,dexa-rhinaspray nasal spray 307 | 1141195474,zerobase cream 308 | 1141191774,optrex allergy eye drops 309 | 1140928346,gastrocote s/f liquid 310 | 1140923344,co-codaprin 311 | 1140910794,picosulphate 312 | 1140881418,premiums tablet 313 | 1140871926,dhc continus 120mg m/r tablet 314 | 1140870796,calcium-sandoz syrup 315 | 1140865370,topal tablet 316 | 1140856340,solpadeine forte dispersible tablet 317 | 1141195844,dermol cream 500g 318 | 1141175766,co-cyprindiol 319 | 1141156858,domperamol tablet 320 | 1141146044,laxoberal 5mg/5ml liquid 321 | 1140921988,tylex effervescent soluble tablet 322 | 1140910572,nitroglycerol 323 | 1140882422,co-fluampicil 324 | 1140876496,soya oil+mixed lauromacrogols 82.95/15% bath oil 325 | 1140873934,septrin forte 960mg tablet 326 | 1140862960,visclair 100mg tablet 327 | 1140860736,accuretic tablet 328 | 1141192638,sebco ointment 329 | 1141189626,liquivisc 0.25% eye gel 330 | 1141188502,pollenase 50micrograms nasal spray 331 | 1141179954,care hayfever relief 50micrograms nasal spray 332 | 1141165754,librofem 200mg tablet 333 | 1140926516,optrex dry eye therapy eye drops 334 | 1140923282,co-flumactone 335 | 1140911560,nambumetone 336 | 1140882468,daktarin 25mg/ml oral gel 337 | 1140878488,t/gel shampoo 338 | 1140878194,lubrifilm eye ointment 339 | 1140877866,ibrufhalal 200mg tablet 340 | 1140876314,hydrous ointment bp 341 | 1140876008,hydroxyethylcellulose 342 | 1140871830,diconal tablet 343 | 1140866396,aldactide 25 tablet 344 | 1140863440,meprate 400mg tablet 345 | 1141189680,ultramol capsule 346 | 1141189572,nasivin 0.05% nasal spray 347 | 1141188900,sulazine ec 
500mg e/c tablet
348 | 1141157388,covonia bronchial balsam syrup
349 | 1140911756,askit powder
350 | 1140911678,waterfall tablet
351 | 1140880314,daktarin 0.16% powder spray
352 | 1140876608,feroglobin b12 syrup
353 | 1140876118,cerumol ear drops
354 | 1140868800,loron 400mg capsule
355 | 1140866412,lasilactone capsule
356 | 1140852908,multivite pellet
357 | 1140851150,laxoberal 5mg/5ml elixir
358 | 1207,unknown supplement
359 | 1141189598,adult meltus dry coughs with congestion oral liquid
360 | 1141184672,e45 itch relief cream
361 | 1141166848,dexa-rhinaspray duo aqueous nasal spray
362 | 1141163656,oculotect 5% single-use eye drops
363 | 1140925942,caprin 75mg e/c tablet
364 | 1140910754,polyacrylic acid
365 | 1140909910,aveeno bath oil
366 | 1140882768,vista-methasone n eye/ear/nose drops
367 | 1140879466,antacid liquid
368 | 1140878624,collodion product
369 | 1140878254,bath e45 bath oil
370 | 1140878224,lipobase cream
371 | 1140878216,orabase oral paste
372 | 1140876388,saliva orthana spray
373 | 1140874140,ciproxin 250mg tablet
374 | 1140869340,triadene tablet
375 | 1140868082,valoid 50mg tablet
376 | 1140866408,frusene tablet
377 | 1140865228,topical anti-inflammatory prep[1]
378 | 1140862912,simple linctus
379 | 1140857716,estrovis 4mg tablet
380 | 1140852970,vita-e 75iu tablet
381 | 1141195842,dermol cream 100g
382 | 1141181186,co-zidocapt 25mg/12.5mg tablet
383 | 1141173798,aerodiol 150micrograms nasal spray
384 | 1141172704,trizivir tablet
385 | 1141172298,sst tablet
386 | 1141157480,co-magaldrox product
387 | 1141151346,ciproxin 100mg tablet
388 | 1140911816,strepsils lozenge
389 | 1140910558,fusidate sodium
390 | 1140910416,ung emuls - ungentum emulsificans
391 | 1140882624,vista-methasone 0.1% eye/ear/nose drops
392 | 1140882018,aller-eze tablet
393 | 1140880336,colomycin topical powder
394 | 1140880324,nystaform cream
395 | 1140874084,rifinah 300 tablet
396 | 1140870596,slow sodium 600mg m/r tablet
397 | 1140869414,cystopurin 3g/sachet granules
398 | 1140868972,danol 100mg capsule
399 | 1140868286,paracodol soluble tablet
400 | 1140865552,liquorice
401 | 1140861410,opilon 40mg tablet
402 | 1140854254,dermacare cream 100ml
403 | 1140852900,juvel tablet
404 | 1140850748,mucogel tablet
405 | 1141200880,oilatum bath formula liquid bath additive 150ml
406 | 1141200726,lisicostad hct 10/12.5mg tablet
407 | 1141193276,pepcidtwo chewable indigestion tablet
408 | 1141192718,galpharm non-drowsy allergy relief 10mg tablet
409 | 1141189606,eumobase cream
410 | 1141188790,peptobismol 1.752% suspension
411 | 1141181398,liposic eye gel
412 | 1141173536,gelclair 15ml/sachet oral gel
413 | 1141168824,heliclear triple pack
414 | 1141167990,oilatum fragrance free liquid bath additive
415 | 1140923850,betaferon 9.6 million iu injection (pdr for recon)+diluent
416 | 1140909904,tri-iodothyronine product
417 | 1140888666,nitrate vasodilator
418 | 1140883818,vaseline dermacare cream
419 | 1140882998,miscellaneous cystitis remedies
420 | 1140882964,oral hypoglycaemic
421 | 1140881342,sodium acid phosphate
422 | 1140879396,codalax liquid
423 | 1140876260,bonjela 8.7% oral gel
424 | 1140873932,septrin 480mg dispersible tablet
425 | 1140873830,fucidin 250mg tablet
426 | 1140872808,augmentin 125mg/31mg/5ml s/f suspension
427 | 1140869174,eugynon 30 tablet
428 | 1140865832,predenema 20mg/100ml standard tube retention enema
429 | 1140865758,isogel granules
430 | 1140865702,normax capsule
431 | 1140860330,tolerzide tablet
432 | 1140856428,paramol 10/500mg tablet
433 | 1140851272,anacal suppository
434 | 1140851218,anacal ointment
435 | 1141194948,vantage pharmacy sleep aid 50mg tablet
436 | 1141193808,colomycin 2million units injection (pdr for recon)
437 | 1141189678,ultramol tablet
438 | 1141188180,beechams all in one syrup
439 | 1141175200,medocodene 30/500 capsule
440 | 1141174552,tobradex eye drops
441 | 1141173574,night nurse capsule
442 | 1141168116,refresh 1.4% ophthalmic solution
443 | 1141157476,co-beneldopa product
444 | 1141157472,calcium polystyrene sulphonate product
445 | 1141152366,daktarin dual action 2% cream
446 | 1140927388,remedeine effervescent tablet
447 | 1140927384,remedeine forte effervescent tablet
448 | 1140927202,mxl 90mg m/r capsule
449 | 1140916984,emulsifying soap
450 | 1140910564,glybenclamide
451 | 1140910026,zinc+castor oil cream bp
452 | 1140909918,biosynthetic human growth hormone
453 | 1140909470,otex ear drops
454 | 1140880316,daktarin 2% dusting powder
455 | 1140879426,antiemetic
456 | 1140879390,co-simalcite
457 | 1140878306,calendolon ointment
458 | 1140878282,k/l dry skin cream
459 | 1140878026,ralgex spray
460 | 1140874146,ciproxin 500mg tablet
461 | 1140874082,rifinah 150 tablet
462 | 1140874080,rifater tablet
463 | 1140871228,fenopron 300mg tablet
464 | 1140869264,synphase tablet
465 | 1140868470,estrapak 50micrograms/1mg patch+tablet
466 | 1140867860,faverin 50mg tablet
467 | 1140865822,anacal rectal ointment
468 | 1140865294,gastron tablet
469 | 1140864536,df118 forte 40mg tablet
470 | 1140864408,fleet enema
471 | 1140863028,welldorm tablet
472 | 1140860334,trasidrex tablet
473 | 1140857990,minovlar tablet
474 | 1140855838,evacalm 2mg tablet
475 | 1140855520,duo-autohaler inhaler
476 | 1140855332,iso-autohaler 80micrograms inhaler
477 | 1140851692,capozide 50mg tablets x28
478 | 1141201322,dermablend leg and body natural cover cream
479 | 1141200458,denzapine 25mg tablet
480 | 1141193146,gammaderm cream
481 | 1141192286,aller-eze 0.05% eye drops
482 | 1141190006,propain plus caplet
483 | 1141184648,human luteinising hormone product
484 | 1141182650,day and night nurse capsule
485 | 1141173902,vivioptal capsule
486 | 1141173572,night nurse oral solution
487 | 1141145658,angiotensin ii receptor antagonist
488 | 1140927338,compound coconut ointment
489 | 1140927204,mxl 120mg m/r capsule
490 | 1140923682,tricalcium phosphate 3.3g/sachet powder
491 | 1140923648,methodex 1mg/1ml mixture
492 | 1140923404,co-prenozide
493 | 1140917452,metazem 60mg m/r tablet
494 | 1140916354,ailax suspension
495 | 1140911598,saline 0.9% topical solution
496 | 1140910776,cantassium vitamin b6 50mg tablet
497 | 1140910726,disodium clodronate
498 | 1140910674,ethinylnortestosterone
499 | 1140910634,deltahydrocortisone
500 | 1140909722,amipramizide
501 | 1140888766,nacl - sodium chloride
502 | 1140888460,electrolade product
503 | 1140884422,dipipanone
504 | 1140884140,daktarin powder
505 | 1140883810,locobase cream
506 | 1140882546,fucidin gel
507 | 1140882376,sulpitil 200mg tablet
508 | 1140882276,enterosan tablet
509 | 1140881416,mucaine suspension
510 | 1140880268,alcoholic coal tar extract 5% shampoo
511 | 1140880154,bromine complexes
512 | 1140880058,sulphur+salicylic acid cream bp
513 | 1140879506,effico tonic
514 | 1140879412,phosphate enema
515 | 1140878664,mycota powder
516 | 1140878610,mycota cream
517 | 1140878262,zeasorb dusting powder
518 | 1140878228,vita-e ointment
519 | 1140877696,abidec drops
520 | 1140876646,calcium polystyrene sulphonate
521 | 1140876318,hydrous wool fat ointment bp
522 | 1140872918,magnapen capsule
523 | 1140872816,augmentin 1.2g injection (pdr for recon)
524 | 1140872802,augmentin 375mg s/f dispersible tablet
525 | 1140872032,migravess effervescent tablet
526 | 1140871924,dhc continus 90mg m/r tablet
527 | 1140871028,becosym tablet
528 | 1140869282,noristerat 200mg/1ml oily injection
529 | 1140868090,dimenhydrinate
530 | 1140865894,dulco-lax 5mg paediatric suppository
531 | 1140864176,monozide 10 tablet
532 | 1140861568,minihep calcium 5000iu/0.2ml injection
533 | 1140860406,moducren tablet
534 | 1140860338,viskaldix tablet
535 | 1140856806,amoxidin 500mg capsule
536 | 1140856418,panadeine forte tablet
537 | 1140856416,panadeine tablet
538 | 1140856406,medocodeine tablet
539 | 1140856332,antoin dispersible tablet
540 | 1140856312,claradin 300mg tablet
541 | 1140855816,congesteze 120mg/1mg tablet
542 | 1140852894,dalivit capsule
543 | 1141200876,oilatum junior bath formula liquid bath additive 300ml
544 | 1141200872,oilatum junior bath formula liquid bath additive
545 | 1141200736,galpharm flu strength all in one s/f oral solution
546 | 1141200470,acnocin 2000/35 tablet
547 | 1141200110,galsud 30mg/5ml linctus
548 | 1141199942,tilolec 100mg/25mg m/r tablet
549 | 1141199916,galpharm heartburn relief 10mg e/c tablet
550 | 1141195034,salinum sugar free oral solution
551 | 1141194946,oilatum fragrance free junior liquid bath additive
552 | 1141193210,day nurse capsule
553 | 1141191194,witch hazel product
554 | 1141190152,dymotil tablet
555 | 1141189772,gonapeptyl depot 3.75mg inj (pdr for recon)+solv p/f syringe
556 | 1141188676,carglutamic acid
557 | 1141188504,pollenase allergy 2% eye drops
558 | 1141186802,viraferonpeg 150mcg pdr+solv for soln for inj prefilled pen
559 | 1141186800,viraferonpeg 120mcg pdr+solv for soln for inj prefilled pen
560 | 1141186794,viraferonpeg 80mcg pdr+solv for soln for inj prefilled pen
561 | 1141184174,clarithrom tab+lansopraz cap+metronidaz tab 500/30/400mg pck
562 | 1141176172,silgel cream
563 | 1141173956,oilatum junior cream
564 | 1141170516,daktarin dual action 0.16% powder spray
565 | 1141168848,rinstead contact pastille
566 | 1141164618,ketil 2.5% gel
567 | 1141163094,dayleve 0.1% cream
568 | 1141157438,ciproxin 5g/100ml oral suspension
569 | 1141152070,ampitrin 125mg/5ml oral suspension
570 | 1141150478,decubal cream
571 | 1141150430,dermacare cream 150ml
572 | 1140928624,frusemek 5mg/40mg tablet
573 | 1140928266,solpaflex tablet
574 | 1140928260,panadeine co tablet
575 | 1140927624,lassar's paste
576 | 1140926360,alphaparin 3000iu/0.3ml prefilled syringe
577 | 1140923752,meronem 1g infusion kit
578 | 1140922936,enlive
579 | 1140922344,dermamist spray
580 | 1140921652,levorphanol
581 | 1140913318,rhdnase
582 | 1140913038,colomycin topical powder 1g
583 | 1140910988,vita-e cream
584 | 1140910818,butamidum
585 | 1140910802,androstanazol
586 | 1140910780,cantassium vitamin e 200iu capsule
587 | 1140910664,benzoxazocine
588 | 1140910644,fenopraine
589 | 1140910642,diprazinum
590 | 1140910620,aldadiene potassium
591 | 1140910602,amidine
592 | 1140910428,sa - salicylic acid
593 | 1140910370,meclastine
594 | 1140909880,hydroxycholecalciferol
595 | 1140909734,fortespan spansule
596 | 1140909428,earex ear drops
597 | 1140888874,salivace sugar free spray
598 | 1140888462,gluco-lyte
599 | 1140883528,epifrin 1% eye drops
600 | 1140882220,paracets 500mg capsule
601 | 1140882146,uniflu plus gregovite c tablet
602 | 1140882106,femigraine tablet
603 | 1140881412,algicon suspension
604 | 1140880458,zinc undecenoate+undecenoic acid 20/5% cream
605 | 1140880166,podophyllum resin
606 | 1140880018,zinc+salicylic acid paste bp
607 | 1140879930,strong coal tar solution+pine tar 5/5% gel
608 | 1140879674,pipothiazine
609 | 1140878608,monphytol paint
610 | 1140878586,calmurid solution
611 | 1140878316,lacticare lotion
612 | 1140878312,kamillosan ointment
613 | 1140878300,alcoderm cream
614 | 1140878242,aveeno oilated bath additive
615 | 1140877744,steripod blue topical liquid
616 | 1140877706,minadex sugar-free oral drops
617 | 1140876422,soap substitute+zinc oxide 5% cream wash
618 | 1140876338,zinc+castor oil ointment bp
619 | 1140876336,zinc 15% ointment bp
620 | 1140876330,flexible collodion bp
621 | 1140876324,simple ointment bp
622 | 1140876316,hydrous wool fat bp
623 | 1140876214,tyrocane lozenge
624 | 1140875594,white liniment bp
625 | 1140874954,hydrocortistab 20mg tablet
626 | 1140874410,fansidar tablet
627 | 1140874116,metrozol 500mg/100ml infusion
628 | 1140874000,zinamide 500mg tablet
629 | 1140873936,septrin adult suspension
630 | 1140873834,fucidin 500mg i-v infusion+buffer
631 | 1140873812,colomycin 1.5million units tablet
632 | 1140873798,bactrim 480mg tablet
633 | 1140873696,erythromid ds 500mg e/c tablet
634 | 1140873694,erythromid 250mg e/c tablet
635 | 1140872976,rimoxallin 500mg capsule
636 | 1140872826,augmentin 250mg/62mg/5ml s/f suspension
637 | 1140871066,gentian alkaline mixture
638 | 1140871034,vigranon b syrup
639 | 1140870492,octovit tablet
640 | 1140870480,fefol-vit spansule
641 | 1140870310,ferfolic sv tablet
642 | 1140870308,fefol spansule
643 | 1140870104,introna-2b 3million iu injection (pdr for recon)+diluent
644 | 1140869338,tri-minulet tablet
645 | 1140869272,neogest tablet
646 | 1140869258,neocon 1/35 tablet
647 | 1140869188,schering pc4 tablet
648 | 1140869184,ovran 30 tablet
649 | 1140869032,dienoestrol
650 | 1140868512,syntex menophase tablet
651 | 1140868280,cosalgesic tablet
652 | 1140868076,cinaziere 15mg tablet
653 | 1140867988,dramamine 50mg tablet
654 | 1140867734,concordin 5mg tablet
655 | 1140866692,beta-adrenoceptor blocking drug
656 | 1140866442,diumide-k continus m/r tablet
657 | 1140866418,fru-co tablet
658 | 1140866410,kalspare tablet
659 | 1140866404,dytide capsule
660 | 1140866400,amil-co tablet
661 | 1140866328,triam-co tablet
662 | 1140866008,rowachol capsule
663 | 1140865760,metamucil powder
664 | 1140865548,pyrogastrone tablet
665 | 1140865478,tripotassium dicitratobismuthate 120mg tablet
666 | 1140864808,tropergen tablet
667 | 1140864562,introna 25million iu/5ml injection solution
668 | 1140864502,testotop tts 15mg transdermal patch
669 | 1140863034,chlormethiazole
670 | 1140862124,exirel 200micrograms inhaler
671 | 1140861776,antiplatelet drug
672 | 1140861444,saventrine 30mg tablet
673 | 1140860410,prestim tablet
674 | 1140860348,atenixco 50mg/12.5mg tablet
675 | 1140859776,pernomol paint
676 | 1140859282,nitrophenol
677 | 1140858378,ironorm capsule
678 | 1140858324,medroxyprogest 80mg/ml suspension 100ml
679 | 1140858310,gastrovite tablet
680 | 1140858306,fesovit-z m/r capsule
681 | 1140857920,minilyn tablet
682 | 1140857628,gestone 10mg/1ml injection
683 | 1140857198,septrin 960mg/3ml intramuscular injection
684 | 1140856754,ciclacillin
685 | 1140856456,df118 10mg/5ml elixir
686 | 1140856422,paradeine tablet
687 | 1140856410,neurodyne capsule
688 | 1140856214,solprin 300mg dispersible tablet
689 | 1140856114,durophet 7.5mg m/r capsule
690 | 1140856040,methyprylone
691 | 1140855890,dormonoct 1mg tablet
692 | 1140855870,almazine 1mg tablet
693 | 1140855426,biophylline 350mg m/r tablet
694 | 1140854432,cortacream 1% band
695 | 1140854256,dermacare lotion 75ml
696 | 1140854112,merocaine lozenge
697 | 1140854000,hayphryn nasal spray
698 | 1140853986,neophryn 0.5% nasal spray
699 | 1140853676,opulets sodium chloride single-use eye drops 0.5ml
700 | 1140853440,lachesine chloride
701 | 1140852904,minamino compound syrup
702 | 1140852884,calcimax syrup
703 | 1140852876,tonivitan syrup
704 | 1140852872,lipotriad capsule
705 | 1140851360,brinaldix k tablet
706 | 1140851306,dehydrocholic acid
707 | 1140851278,betnovate compound suppository
708 | 1140851128,hamamelis 200mg suppository
709 | 1140851066,trifyba 250g powder
710 | 1140851064,lejfibre 10g biscuits
711 | 1140851062,fybranta 2g tablet
712 | 1140850932,bellocarb tablet
713 | 1140850720,gastrils 500mg green (mint) pastille
714 | 1140850714,droxalin tablet
--------------------------------------------------------------------------------