├── netwalk ├── __init__.py ├── utils.py ├── models.py ├── walk.py ├── walkdataset.py └── translator.py ├── Makefile ├── JuxtaposeTutorial ├── line.png ├── keypair.png ├── attachvolume.png ├── attachvolume2.png ├── keypairname.png ├── securitygroup.png ├── spotrequests.png ├── connectinstance.png ├── selectinstance.png └── Embedding_Methodology.png ├── requirements.txt ├── regression.py ├── test ├── data │ ├── line-config.json │ ├── cross-config.json │ ├── circle-config.json │ ├── prefrontal_cortex.json │ └── brain-heart-config.json ├── test_utils.py ├── test_walkdataset.py ├── test_walk.py └── test_temp.py ├── Line └── IDConvertor.json ├── make_directed.py ├── network_stats.py ├── test_translator.py ├── find_common_genes.py ├── random_tree_generator.py ├── dataset_generator.py ├── similarity.py ├── dangle.py ├── dimensionality_reduction.py ├── README.md └── runner.py /netwalk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | setup: 2 | pip install -r requirements.txt 3 | regression: 4 | python regression.py -------------------------------------------------------------------------------- /JuxtaposeTutorial/line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/line.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/keypair.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/keypair.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/attachvolume.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/attachvolume.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/attachvolume2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/attachvolume2.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/keypairname.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/keypairname.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/securitygroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/securitygroup.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/spotrequests.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/spotrequests.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/connectinstance.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/connectinstance.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/selectinstance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/selectinstance.png -------------------------------------------------------------------------------- /JuxtaposeTutorial/Embedding_Methodology.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/klovens/juxtapose/HEAD/JuxtaposeTutorial/Embedding_Methodology.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gensim==3.8.3 2 | matplotlib==3.1.3 3 | networkx==2.4 4 | numpy==1.18.1 5 | pandas==1.0.1 6 | scikit-learn==0.22.1 7 | scipy==1.4.1 8 | seaborn==0.10.0 9 | sklearn==0.0 10 | torch==1.6.0 11 | torchvision==0.7.0 -------------------------------------------------------------------------------- /regression.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | # initialize the test suite 5 | loader = unittest.TestLoader() 6 | start_dir = './test/' 7 | suite = loader.discover(start_dir) 8 | 9 | # initialize a runner, pass it your suite and run it 10 | runner = unittest.TextTestRunner(verbosity=3) 11 | result = runner.run(suite) -------------------------------------------------------------------------------- /test/data/line-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 10, 3 | "percentage": 0.4, 4 | "n_anchors": 6, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 3, 7 | "max_dangle_size": 10, 8 | "anchor_file_address": "test/data/line_anchors.csv", 9 | "phenotypes": ["1", "2"], 10 | "experiment_name": "Line", 11 | "test_ratio": 0.5, 12 | "data_directory": "test/data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/cross-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.4, 4 | "n_anchors": 10, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 3, 7 | "max_dangle_size": 5, 8 | "anchor_file_address": "test/data/cross_anchors.csv", 9 | "phenotypes": ["cross_1", "cross_2"], 10 | "experiment_name": "Cross", 11 | "test_ratio": 0.5, 12 | "data_directory": "test/data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/circle-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.4, 4 | "n_anchors": 10, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 3, 7 | "max_dangle_size": 5, 8 | "anchor_file_address": "test/data/circle_anchors.csv", 9 | "phenotypes": ["circle_1", "circle_2"], 10 | "experiment_name": "Circle", 11 | "test_ratio": 0.5, 12 | "data_directory": "test/data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/prefrontal_cortex.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.3, 4 | "n_anchors": 20, 5 | "anchor_test_ratio": 0.2, 6 | "min_dangle_size": 10, 7 | 
"max_dangle_size": 15, 8 | "anchor_file_address": "data/common_cortex_genes.csv", 9 | "phenotypes": ["human", "chimpanzee", "macaque", "mouse"], 10 | "experiment_name": "Cortex", 11 | "test_ratio": 0.2, 12 | "data_directory": "pcortex_data" 13 | } 14 | -------------------------------------------------------------------------------- /test/data/brain-heart-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "n_replicates": 1, 3 | "percentage": 0.4, 4 | "n_anchors": 20, 5 | "anchor_test_ratio": 0.5, 6 | "min_dangle_size": 7, 7 | "max_dangle_size": 10, 8 | "anchor_file_address": "data/heart_brain_shared.csv", 9 | "phenotypes": ["brain_1", "brain_2", "brain_3", "heart_1", "heart_2", "heart_3"], 10 | "experiment_name": "Heart_Brain", 11 | "test_ratio": 0.5, 12 | "data_directory": "data" 13 | } 14 | -------------------------------------------------------------------------------- /Line/IDConvertor.json: -------------------------------------------------------------------------------- 1 | {"0": 0, "1": 1, "10": 2, "11": 3, "12": 4, "13": 5, "14": 6, "15": 7, "16": 8, "17": 9, "18": 10, "19": 11, "2": 12, "20": 13, "3": 14, "4": 15, "5": 16, "6": 17, "7": 18, "8": 19, "9": 20, "pseudo_14": 21, "pseudo_14_000": 22, "pseudo_14_001": 23, "pseudo_14_002": 24, "pseudo_15": 25, "pseudo_15_000": 26, "pseudo_15_001": 27, "pseudo_15_002": 28, "pseudo_15_003": 29, "pseudo_5": 30, "pseudo_5_000": 31, "pseudo_5_001": 32, "pseudo_5_002": 33, "pseudo_5_003": 34, "pseudo_5_004": 35, "pseudo_5_005": 36} -------------------------------------------------------------------------------- /make_directed.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | G = nx.read_edgelist("/home/farhad/Network/juxt/brain_heart_data/heart_1.csv", delimiter=',',nodetype=str, data=(('cor',float),)) 4 | #print(G.edges(data=True)) 5 | T = nx.algorithms.tree.mst.minimum_spanning_tree(G, weight='cor') 6 | edj = T.edges() 7 | print(edj) 8 | address = 'test/data/heart_directed_1.txt' 9 | with open(address, 'w') as fout: 10 | for e in edj: 11 | # write edges to file 12 | node_1 = e[0] 13 | node_2 = e[1] 14 | fout.write('{}\t{}\n'.format(str(node_1),str(node_2))) 15 | -------------------------------------------------------------------------------- /network_stats.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def alignment_permutation_test(vocab1_length, vocab2_length, distance, actual_score, num_iteration=1000): 5 | n = min(vocab1_length, vocab2_length) 6 | indices = list(range(n)) 7 | scores = [] 8 | for i in range(num_iteration): 9 | v1 = np.random.choice(indices, size=n) 10 | v2 = np.random.choice(indices, size=n) 11 | s = 0 12 | for i, j in zip(v1, v2): 13 | s += distance[i, j] 14 | scores.append(s/n) 15 | scores = np.array(scores) 16 | print(scores) 17 | p = sum(scores >= actual_score) / num_iteration 18 | return p 19 | 20 | 21 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from netwalk.utils import Vocabulary 3 | from netwalk.utils import load 4 | from netwalk.utils import Similarity 5 | 6 | class TestUtils(unittest.TestCase): 7 | def test_load(self): 8 | similarity = load("data/similarity_file.csv", sep=",") 9 | expected = {("gene1", "gene2"): 0.5, 10 | ("gene2", "gene3"): 0.7, 11 | ("gene1", 
"gene3"): 0.0} 12 | 13 | self.assertDictEqual(similarity, expected) 14 | 15 | def test_Vocabulary(self): 16 | genes = ['g0', 'g1', 'g2', 'g3', 'g4'] 17 | id_2_name_map = {0: 'g0', 1: 'g1', 2: 'g2', 3: 'g3', 4: 'g4'} 18 | name_2_id_map = {'g0': 0, 'g1': 1, 'g2': 2, 'g3': 3, 'g4': 4} 19 | vocab = Vocabulary(genes) 20 | self.assertDictEqual(vocab.index, name_2_id_map) 21 | self.assertDictEqual(vocab.name, id_2_name_map) 22 | self.assertListEqual(vocab.genes, genes) 23 | 24 | def test_Similarity(self): 25 | d = {("gene1", "gene2"): 0.5, 26 | ("gene2", "gene3"): 0.7, 27 | ("gene1", "gene3"): 0.6, 28 | ("gene1", "gene4"): 0.3} 29 | similarity = Similarity(d) 30 | symmetric_keys = similarity.symmetric_key_set() 31 | expected_sym_keys = [("gene1", "gene2"), ("gene2", "gene1"), ("gene2", "gene3"), ("gene3", "gene2"), 32 | ("gene1", "gene3"), ("gene3", "gene1"), ("gene1", "gene4"), ("gene4", "gene1")] 33 | 34 | assert set(expected_sym_keys) == set(symmetric_keys) 35 | -------------------------------------------------------------------------------- /test_translator.py: -------------------------------------------------------------------------------- 1 | from netwalk.translator import IDCovertor 2 | import os 3 | import argparse 4 | import json 5 | import glob 6 | 7 | if __name__ == '__main__': 8 | parser = argparse.ArgumentParser(description='Translate datasets') 9 | parser.add_argument('-c', '--config', metavar='JSON file path', 10 | action='store', required=True, 11 | help='Path to a config file') 12 | args = parser.parse_args() 13 | # read config file 14 | with open(args.config) as fin: 15 | params = json.load(fin) 16 | 17 | edge_list_file_addresses = glob.glob(os.path.join(params['experiment_name'], 18 | 'anchored_*.csv')) 19 | print(os.listdir(params['experiment_name'])) 20 | #output_file_address = 'Line/translated_line.csv' 21 | convertor = IDCovertor(edge_list_file_addresses, sep=',') 22 | for edge_list_file_address in edge_list_file_addresses: 23 | dir_path, file_name = os.path.split(edge_list_file_address) 24 | output_file_address = os.path.join(dir_path, f'translated_{file_name}') 25 | convertor.translate(edge_list_file_address, output_file_address, sep=',') 26 | 27 | convertor_file = os.path.join(params['experiment_name'], 28 | 'IDConvertor.json') 29 | convertor.save(convertor_file) 30 | con = IDCovertor.load(convertor_file) 31 | assert con.id2int == convertor.id2int 32 | assert con.int2id == convertor.int2id 33 | assert con.ids == convertor.ids 34 | 35 | -------------------------------------------------------------------------------- /netwalk/utils.py: -------------------------------------------------------------------------------- 1 | ''' This module contains utilities classes and functions. 2 | 3 | ''' 4 | import os.path 5 | import json 6 | import random 7 | import torch 8 | import copy 9 | import numpy as np 10 | 11 | class Vocabulary(object): 12 | '''Create a bijective mapping between gene name/ID and indices. 13 | 14 | Args: 15 | genes: An array-like containing gene names/IDs. 
16 | ''' 17 | 18 | def __init__(self, genes): 19 | self.genes = genes 20 | self.index = dict(zip(sorted(genes), range(len(genes)))) 21 | self.name = {idx: gene for gene, idx in self.index.items()} 22 | self.dim = len(self.genes) 23 | 24 | def to_indices(self, genes): 25 | return [self.index[gene] for gene in genes] 26 | 27 | def to_names(self, indices): 28 | return [self.name[i] for i in indices] 29 | 30 | def __len__(self): 31 | return len(self.genes) 32 | 33 | 34 | def load_walks(file_dir='.', prefix='pair_walk', sep=','): 35 | ''' Read dataset from a file. 36 | 37 | Args: 38 | address: Address of a CSV file containing walks. 39 | sep: A field delimiter. 40 | Returns: 41 | data: A list of walks. 42 | genes: The set of all genes that appear in at least one walk. 43 | ''' 44 | walk_address = os.path.join(file_dir, prefix + '_walks.csv') 45 | walks = np.genfromtxt(walk_address, dtype=np.uint16, delimiter=sep) 46 | return walks 47 | 48 | 49 | def dump_walks(walks, out_dir='.', prefix='pair_walk', sep=','): 50 | # Create walks file 51 | pass 52 | 53 | 54 | -------------------------------------------------------------------------------- /test/test_walkdataset.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from netwalk.walkdataset import WalkDataset 3 | from netwalk.walkdataset import PairWalkDataset 4 | from netwalk.utils import Vocabulary 5 | 6 | 7 | class TestWalkDataset(unittest.TestCase): 8 | def setUp(self): 9 | data = [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 10 | [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 11 | [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], 12 | [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 13 | [40, 41, 42, 43, 44, 45, 46, 47, 48, 49]] 14 | self.data = data 15 | 16 | def test_from_csv(self): 17 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',') 18 | self.assertListEqual(dataset.walks, self.data) 19 | 20 | def test__len__(self): 21 | dataset = WalkDataset(original_walks=[], vocab=Vocabulary([])) 22 | self.assertEqual(len(dataset), 0) 23 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',') 24 | self.assertEqual(len(dataset), 5) 25 | 26 | def test__getitem__(self): 27 | dataset = WalkDataset.from_csv('data/sample_walk_dataset.csv', sep=',') 28 | for i, expected in enumerate(self.data): 29 | self.assertListEqual(expected, dataset[i]) 30 | 31 | 32 | class TestPairedWalkDataset(unittest.TestCase): 33 | def setUp(self): 34 | data = [([0, 1, 2, 3, 4], [0, 1, 2, 3, 4]), 35 | ([5, 6, 7, 8, 9], [5, 8, 7, 8, 9])] 36 | self.data = data 37 | 38 | def test_from_csv(self): 39 | dataset = PairWalkDataset.from_csv('data/sample_pair_walk_dataset.csv', sep=',') 40 | self.assertEqual(len(self.data), len(dataset)) 41 | for i, observed_walk in enumerate(dataset): 42 | (expected_walk_a, expected_walk_b) = self.data[i] 43 | observed_walk_a, observed_walk_b = observed_walk 44 | self.assertListEqual(expected_walk_a, observed_walk_a) 45 | self.assertListEqual(expected_walk_b, observed_walk_b) 46 | -------------------------------------------------------------------------------- /find_common_genes.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import multiprocessing as mp 3 | 4 | 5 | def load(address): 6 | genes = set() 7 | with open(address) as fin: 8 | for line in fin: 9 | line = line.strip() 10 | if line == '': 11 | continue 12 | gene_a, gene_b, _ = line.split(',') 13 | genes.add(gene_a) 14 | genes.add(gene_b) 15 | return genes 16 | 17 | def 
get_common_genes(edge_lists, genes_of_interest): 18 | pool = mp.Pool(min(mp.cpu_count(), len(edge_lists))) 19 | profiles_genes = [pool.apply(load, args=(address, )) for address in edge_lists] 20 | pool.close() 21 | common_genes = set(genes_of_interest) 22 | for genes in profiles_genes: 23 | common_genes = genes & common_genes 24 | return common_genes 25 | 26 | 27 | if __name__ == '__main__': 28 | output_address = 'data/common_cortex_genes.csv' 29 | edge_lists = [ 30 | 'pcortex_data/network_1_12_chimpanzee.csv', 31 | 'pcortex_data/network_1_12_human.csv', 32 | 'pcortex_data/network_1_12_macaque.csv', 33 | 'pcortex_data/network_1_12_mouse.csv'] 34 | #genes_of_interest = '/home/fam918/Documents/CodeRepos/WALKS/netwalk/data/homeostasis_genes.csv' 35 | #output_address = 'heart_brain_shared.csv' 36 | #edge_lists = ['/home/farhad/Network/netwalk/data/network_1_200_heart.csv', 37 | # '/home/farhad/Network/netwalk/data/network_2_200_heart.csv', 38 | # '/home/farhad/Network/netwalk/data/network_3_200_heart.csv', 39 | # '/home/farhad/Network/netwalk/data/network_1_200_brain.csv', 40 | # '/home/farhad/Network/netwalk/data/network_2_200_brain.csv', 41 | # '/home/farhad/Network/netwalk/data/network_3_200_brain.csv'] 42 | genes_of_interest = 'data/cellular_homeostasis.csv' 43 | with open(genes_of_interest) as f: 44 | lines = f.read().splitlines() 45 | common_genes = get_common_genes(edge_lists, lines) 46 | with open(output_address, 'w') as fout: 47 | for gene in common_genes: 48 | fout.write('{}\n'.format(gene)) 49 | -------------------------------------------------------------------------------- /random_tree_generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | import networkx as nx 3 | 4 | START_SIZE=50 5 | CURRENT_SIZE=50 6 | FINAL_SIZE=200 7 | NODE_STEP=10 8 | 9 | while CURRENT_SIZE <= FINAL_SIZE: 10 | #print(CURRENT_SIZE - 1) 11 | address = 'test/data/random_tree_{}.csv'.format(CURRENT_SIZE) 12 | with open(address, 'w') as fout: 13 | if CURRENT_SIZE == START_SIZE: 14 | G = nx.generators.trees.random_tree(START_SIZE) 15 | while nx.number_connected_components(G) > 1: 16 | G = nx.generators.trees.random_tree(START_SIZE) 17 | edj = list(G.edges()) 18 | n = list(G.nodes()) 19 | for e in edj: 20 | # write edges to file 21 | fout.write('{},{},1\n'.format(str(e[0]),str(e[1]))) 22 | fout.write('{},{},1\n'.format(str(e[1]),str(e[0]))) 23 | elif CURRENT_SIZE != START_SIZE: 24 | # read in the previous graph and write it to file 25 | previous_address = 'test/data/random_tree_{}.csv'.format(CURRENT_SIZE-NODE_STEP) 26 | file_previous = open(previous_address, 'r') 27 | Lines = file_previous.readlines() 28 | fout.writelines(Lines) 29 | 30 | G = nx.generators.trees.random_tree(NODE_STEP) 31 | while nx.number_connected_components(G) > 1: 32 | G = nx.generators.trees.random_tree(NODE_STEP) 33 | edj = list(G.edges()) 34 | 35 | for e in edj: 36 | # write edges to file 37 | node_1 = e[0] + CURRENT_SIZE-NODE_STEP - 1 38 | node_2 = e[1] + CURRENT_SIZE- NODE_STEP - 1 39 | fout.write('{},{},1\n'.format(str(node_1),str(node_2))) 40 | fout.write('{},{},1\n'.format(str(node_2),str(node_1))) 41 | # connect a node in graph to a random node in the original graph 42 | rand_node = random.randint(0, CURRENT_SIZE-1-NODE_STEP) 43 | #print(rand_node) 44 | print(CURRENT_SIZE,rand_node,node_1) 45 | fout.write('{},{},1\n'.format(str(rand_node), str(node_1))) 46 | fout.write('{},{},1\n'.format(str(node_1), str(rand_node))) 47 | CURRENT_SIZE= CURRENT_SIZE + NODE_STEP 48 | 49 | 50 | 
address = 'test/data/test_PPI.txt' 51 | with open(address, 'w') as fout: 52 | G = nx.scale_free_graph(100) 53 | edj = list(G.edges()) 54 | 55 | for e in edj: 56 | # write edges to file 57 | node_1 = e[0] 58 | node_2 = e[1] 59 | fout.write('{}\t{}\t1\n'.format(str(node_1),str(node_2))) 60 | -------------------------------------------------------------------------------- /dataset_generator.py: -------------------------------------------------------------------------------- 1 | ''' This module generate walk datasets. 2 | ''' 3 | import argparse 4 | import random 5 | import numpy as np 6 | import netwalk.utils as utils 7 | import os.path 8 | from netwalk.walk import WalkGenerator 9 | from similarity import Similarity 10 | import gensim.models 11 | import time 12 | import multiprocessing as mp 13 | import json 14 | import pandas as pd 15 | import seaborn as sns 16 | import copy 17 | 18 | def generate_walks(edge_list_address, walk_per_node, walk_length, workers = 4): 19 | similarity = Similarity(correlation_file_path=edge_list_address, anchors=[], 20 | alphas=[], sep=',', prefix='pseudo') 21 | genes = list(similarity.idx.keys()) 22 | start_time = time.time() 23 | gen_walk = WalkGenerator(similarity.matrix, genes, walk_length, walk_per_node) 24 | print("takes {} seconds to create walk object.".format( 25 | time.time() - start_time)) 26 | 27 | num_cpus = workers 28 | pool = mp.Pool(num_cpus) 29 | arguments = list(range(len(gen_walk))) 30 | chunk_size = len(gen_walk) // num_cpus 31 | walks = pool.map(gen_walk, arguments, chunksize=chunk_size) 32 | return walks 33 | 34 | 35 | if __name__ == '__main__': 36 | parser = argparse.ArgumentParser(description='Generate datasets') 37 | parser.add_argument('-c', '--config', metavar='JSON file path', 38 | action='store', required=True, 39 | help='Path to a config file') 40 | args = parser.parse_args() 41 | # read config file 42 | with open(args.config) as fin: 43 | params = json.load(fin) 44 | # make walks and train the network 45 | for pheno in params['phenotypes']: 46 | for rep_id in range(params['n_replicates']): 47 | edge_list_address = os.path.join(params['experiment_name'], 48 | 'translated_anchored_{}_{}.csv'.format(pheno, 49 | str(rep_id))) 50 | # Create walks 51 | walks = generate_walks(edge_list_address, params['walk_per_node'], 52 | params['walk_length'], workers=params['n_workers']) 53 | 54 | # Write walks to file 55 | address = os.path.join(params['experiment_name'], 56 | '{}_{}_walks.csv'.format(pheno, str(rep_id))) 57 | with open(address, 'w') as fout: 58 | for w in walks: 59 | fout.write('{}\n'.format(','.join([str(s) for s in w]))) 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /test/test_walk.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from netwalk.walk import Walk 4 | from netwalk.walk import PairWalk 5 | 6 | 7 | class TestWalk(unittest.TestCase): 8 | def test_from_dict(self): 9 | d = {(1, 2): 0.7, 10 | (1, 3): 0.1, 11 | (2, 4): 0.5, 12 | (3, 4): 0.3} 13 | similarity = [[0, 7/8, 1/8, 0], 14 | [7/12, 0, 0, 5/12], 15 | [1/4, 0, 0, 3/4], 16 | [0, 5/8, 3/8, 0]] 17 | CDF = np.array([[0, 0.875, 1, 1], 18 | [7/12, 7/12, 7/12, 1], 19 | [0.25, 0.25, 0.25, 1], 20 | [0, 0.625, 1, 1]]) 21 | 22 | walk = Walk(d) 23 | self.assertListEqual(list(walk._nodes), [1, 2, 3, 4]) 24 | diff = np.array(similarity) - walk.prob 25 | self.assertAlmostEqual(np.linalg.norm(diff), 0) 26 | ids = np.array([walk._ids[node] for node in 
walk._nodes]) 27 | diff = np.linalg.norm(ids - np.arange(len(walk._nodes))) 28 | self.assertAlmostEqual(diff, 0) 29 | diff = np.linalg.norm(walk.cdf - CDF) 30 | self.assertAlmostEqual(diff, 0) 31 | 32 | def test_generate(self): 33 | similarity = {(1, 2): 0.5, (1, 3): 0.0} 34 | 35 | walk = Walk(similarity) 36 | self.assertListEqual(walk.generate(3, 3), [2, 2, 2, 2]) 37 | 38 | self.assertListEqual((walk.generate(1, 5)), [0, 1, 0, 1, 0, 1]) 39 | self.assertListEqual((walk.generate(2, 5)), [1, 0, 1, 0, 1, 0]) 40 | 41 | def test_make_walks(self): 42 | similarity = {(1, 2): 0.5, (1, 3): 0.0} 43 | walk = Walk(similarity) 44 | dataset = walk.make_walks(walk_per_node=2, walk_length=3) 45 | expected_dataset = [[0, 1, 0, 1], [0, 1, 0, 1], [1, 0, 1, 0], 46 | [1, 0, 1, 0], [2, 2, 2, 2], [2, 2, 2, 2]] 47 | for expected, observed in zip(expected_dataset, dataset['walks']): 48 | self.assertListEqual(expected, list(observed)) 49 | expected_nodes = [1, 2, 3] 50 | self.assertListEqual(expected_nodes, list(dataset['nodes'])) 51 | expected_ids = {1: 0, 2: 1, 3: 2} 52 | self.assertEqual(expected_ids, dataset['ids']) 53 | 54 | 55 | class TestPairWalk(unittest.TestCase): 56 | 57 | def test_generate(self): 58 | similarity = {(1, 2): 0.5, (1, 3): 0.0} 59 | walk = PairWalk(similarity) 60 | self.assertListEqual(walk.generate(3, 3), [2, 2, 2, 2, 2, 2, 2, 2]) 61 | self.assertListEqual((walk.generate(1, 5)), [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]) 62 | self.assertListEqual((walk.generate(2, 5)), [1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]) 63 | -------------------------------------------------------------------------------- /netwalk/models.py: -------------------------------------------------------------------------------- 1 | ''' This module contains the models. 2 | 3 | ''' 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.nn import TransformerEncoder, TransformerEncoderLayer 9 | 10 | 11 | class TransformerModel(nn.Module): 12 | 13 | def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5): 14 | super(TransformerModel, self).__init__() 15 | self.model_type = 'Transformer' 16 | self.src_mask = None 17 | self.pos_encoder = PositionalEncoding(ninp, dropout) 18 | encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout) 19 | self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers) 20 | self.encoder = nn.Embedding(ntoken, ninp) 21 | self.ninp = ninp 22 | self.decoder = nn.Linear(ninp, ntoken) 23 | self.init_weights() 24 | 25 | def _generate_square_subsequent_mask(self, sz): 26 | mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1) 27 | mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) 28 | return mask 29 | 30 | def init_weights(self): 31 | initrange = 0.1 32 | self.encoder.weight.data.uniform_(-initrange, initrange) 33 | self.decoder.bias.data.zero_() 34 | self.decoder.weight.data.uniform_(-initrange, initrange) 35 | 36 | def forward(self, src): 37 | if self.src_mask is None or self.src_mask.size(0) != len(src): 38 | device = src.device 39 | mask = self._generate_square_subsequent_mask(len(src)).to(device) 40 | self.src_mask = mask 41 | src = self.encoder(src) * math.sqrt(self.ninp) 42 | src = self.pos_encoder(src) 43 | output = self.transformer_encoder(src, self.src_mask) 44 | output = self.decoder(output) 45 | return F.log_softmax(output, dim=-1) 46 | 47 | def embedding(self, src): 48 | self.eval() # Turn on the evaluation mode 49 | with torch.no_grad(): 50 | src = self.encoder(src) * 
math.sqrt(self.ninp) 51 | src = self.pos_encoder(src) 52 | output = self.transformer_encoder(src, None) 53 | return output 54 | 55 | def save(self, address): 56 | torch.save(self.state_dict(), address) 57 | 58 | @classmethod 59 | def load(cls, model, address, device='cpu'): 60 | model.load_state_dict(torch.load(address)) 61 | model.to(device) 62 | return model 63 | 64 | 65 | class PositionalEncoding(nn.Module): 66 | ''' Positional encoding used in transformers. 67 | ''' 68 | 69 | def __init__(self, d_model, dropout=0.1, max_len=5000): 70 | super(PositionalEncoding, self).__init__() 71 | self.dropout = nn.Dropout(p=dropout) 72 | 73 | pe = torch.zeros(max_len, d_model) 74 | position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) 75 | div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)) 76 | pe[:, 0::2] = torch.sin(position * div_term) 77 | pe[:, 1::2] = torch.cos(position * div_term) 78 | pe = pe.unsqueeze(0).transpose(0, 1) 79 | self.register_buffer('pe', pe) 80 | 81 | def forward(self, x): 82 | x = x + self.pe[:x.size(0), :] 83 | return self.dropout(x) 84 | -------------------------------------------------------------------------------- /similarity.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | 4 | 5 | class Similarity(object): 6 | def __init__(self, correlation_file_path, anchors, alphas, sep=',', 7 | prefix='pseudo', string_id=False): 8 | self.real_genes = set() 9 | with open(correlation_file_path) as fin: 10 | for line in fin: 11 | line = line.strip() 12 | if line == '': 13 | continue 14 | a, b, _ = line.split(sep) 15 | a = a.strip() 16 | b = b.strip() 17 | if string_id is False: 18 | a = int(a) 19 | b = int(b) 20 | self.real_genes.add(a) 21 | self.real_genes.add(b) 22 | self.real_genes = list(sorted(self.real_genes)) 23 | assert set(anchors).issubset(self.real_genes) 24 | self.pseudo_genes = [] 25 | for anchor, alpha in zip(anchors, alphas): 26 | self.pseudo_genes.append('{}_{}'.format(prefix, anchor)) 27 | for i in range(alpha): 28 | self.pseudo_genes.append('{}_{}_{:0>3d}'.format(prefix, anchor, i)) 29 | genes = self.real_genes + self.pseudo_genes 30 | n = len(genes) 31 | self.matrix = np.zeros((n, n), dtype=np.float32) 32 | self.idx = {gene: i for i, gene in enumerate(genes)} 33 | # Assign values to the correlation matrix 34 | with open(correlation_file_path) as fin: 35 | for line in fin: 36 | line = line.strip() 37 | if line == '': 38 | continue 39 | a, b, cor = line.split(sep) 40 | if string_id is False: 41 | a = int(a) 42 | b = int(b) 43 | i = self.idx[a] 44 | j = self.idx[b] 45 | self.matrix[i,j] = np.float32(cor) 46 | self.matrix[j,i] = np.float32(cor) 47 | 48 | def average_correlation(self): 49 | n = len(self.real_genes) 50 | values = self.matrix[0:n, 0:n][np.nonzero(self.matrix[0:n, 0:n])] 51 | return np.mean(values) 52 | 53 | 54 | def __getitem__(self, item): 55 | a, b = item 56 | i = self.idx[a] 57 | j = self.idx[b] 58 | return self.matrix[i, j] 59 | 60 | def transform(self, transform=None): 61 | if transform is None: 62 | transform = lambda x: 0.5 * x + 0.5 63 | n = len(self.real_genes) + len(self.pseudo_genes) 64 | for i in range(n): 65 | for j in range(n): 66 | self.matrix[i, j] = transform(self.matrix[i, j]) 67 | 68 | def apply_threshold(self, lower_cor, upper_cor, value): 69 | n = len(self.real_genes) + len(self.pseudo_genes) 70 | for i in range(n): 71 | for j in range(n): 72 | if self.matrix[i, j] > lower_cor and self.matrix[i, j] < upper_cor: 73 | 
self.matrix[i, j] = value 74 | 75 | def to_csv(self, file_name): 76 | n = len(self.real_genes) + len(self.pseudo_genes) 77 | genes = self.real_genes + self.pseudo_genes 78 | with open(file_name, 'w') as f: 79 | for i in range(n): 80 | for j in range(n): 81 | if i == j: 82 | break 83 | else: 84 | f.write(','.join([genes[i], genes[j], str(self.matrix[i, j])])) 85 | f.write("\n") 86 | 87 | def augment(self, dangles): 88 | genes = self.real_genes + self.pseudo_genes 89 | for (a, b), w in dangles.items(): 90 | assert a in genes, "gene is missing from similarity matrix." 91 | assert b in genes, "gene is missing from similarity matrix." 92 | i = self.idx[a] 93 | j = self.idx[b] 94 | self.matrix[i, j] = w 95 | self.matrix[j, i] = w 96 | -------------------------------------------------------------------------------- /netwalk/walk.py: -------------------------------------------------------------------------------- 1 | ''' This module generates walks from a network. 2 | 3 | ''' 4 | import numpy as np 5 | import gensim.models 6 | import seaborn as sns 7 | from matplotlib import pyplot as plt 8 | import pandas as pd 9 | import time 10 | import multiprocessing as mp 11 | 12 | 13 | EPSILON = 1E-6 14 | 15 | class Probability(): 16 | def __init__(self, matrix, gene_names): 17 | n = matrix.shape[0] 18 | assert matrix.shape[0] == matrix.shape[1] 19 | assert len(gene_names) == n 20 | total_prob = matrix.sum(axis=1).reshape(n, 1) 21 | corrections = [] 22 | for i, p in enumerate(total_prob): 23 | if total_prob[i] < EPSILON: 24 | total_prob[i] = 1 25 | corrections.append(i) 26 | self.prob = matrix / total_prob 27 | for i in corrections: 28 | self.prob[i, i] = 1 29 | for i in range(n): 30 | if abs(np.sum(self.prob[i]) - 1) > EPSILON: 31 | self.prob[i] /= (self.prob[i]).sum() 32 | print((self.prob[i]).sum()) 33 | try: 34 | assert abs(np.sum(self.prob[i]) - 1) < EPSILON 35 | except: 36 | print(abs(np.sum(self.prob[i]) - 1)) 37 | raise 38 | self.idx = {name:i for i, name in enumerate(gene_names)} 39 | 40 | def __getitem__(self, gene): 41 | i = self.idx[gene] 42 | return self.prob[i] 43 | 44 | 45 | 46 | class WalkGenerator(object): 47 | ''' Create walks using a graph defined by a similarity matrix. 48 | 49 | Args: 50 | similarity: A dictionary representing the similarity between 51 | pairs of nodes, where the similarity between nodes u and v 52 | is represented by similarity((u, v)). 53 | ''' 54 | def __init__(self, similarity_matrix, genes, walk_length, walk_per_node, fountains=None): 55 | self.walk_length = walk_length 56 | self.nodes = np.copy(genes) 57 | if fountains is None: 58 | self.fountains = np.copy(self.nodes) 59 | else: 60 | self.fountains = np.copy(fountains) 61 | self.starters = np.repeat(self.fountains, walk_per_node) 62 | np.random.shuffle(self.starters) 63 | self.LENGTH = len(self.starters) 64 | self.prob = Probability(similarity_matrix, self.nodes) 65 | 66 | 67 | 68 | def __len__(self): 69 | return self.LENGTH 70 | 71 | def __getitem__(self, i): 72 | ''' Generate a random walk starting from the i-th gene. 73 | 74 | Args: 75 | start: Starting point of the random walk. 76 | length: Length of the random walk. 
77 | ''' 78 | if i >= self.LENGTH: 79 | raise StopIteration 80 | current_node = self.starters[i] 81 | walk = [current_node] 82 | for _ in range(self.walk_length): 83 | next_node = np.random.choice(self.nodes, p=self.prob[current_node]) 84 | walk.append(next_node) 85 | current_node = next_node 86 | return walk 87 | 88 | def __call__(self, i): 89 | return self[i] 90 | 91 | 92 | #if __name__ == '__main__': 93 | #similarity_matrix = np.random.rand(15000, 15000) 94 | #genes = np.array(range(15000), dtype=np.uint16) 95 | 96 | #similarity_matrix = np.array([[0.0, 0.0, 0.6, 0.0], 97 | # [0.0, 0.0, 0.3, 0.0], 98 | # [0.6, 0.3, 0.0, 0.0], 99 | # [0.0, 0.0, 0.0, 0.0]]) 100 | # 101 | #genes = ['1', '2', '3', '4'] 102 | #start_time = time.time() 103 | #walks = WalkGenerator(similarity_matrix,genes, 50, 100) 104 | #hours, rem = divmod(time.time() - start_time, 3600) 105 | #minutes, seconds = divmod(rem, 60) 106 | #print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)) 107 | #num_cpus = mp.cpu_count() - 1 108 | #pool = mp.Pool(num_cpus) 109 | #arguments = list(range(len(walks))) 110 | #chunk_size = len(walks) // num_cpus 111 | #results = pool.map(walks, arguments, chunksize=chunk_size) 112 | #with open('walks.csv', 'w') as fout: 113 | # for w in results: 114 | # fout.write('{}\n'.format(','.join([str(x) for x in w]))) 115 | 116 | #for w in walks: 117 | # print(w) 118 | 119 | # colour_map = "Greens_r" 120 | # model = gensim.models.Word2Vec(sentences=walks, 121 | # size=5, 122 | # window=2, 123 | # min_count=2, 124 | # workers=3, 125 | # iter=1) 126 | # wv1 = model.wv 127 | # vocab_size = len(genes) 128 | # dist1 = np.zeros((vocab_size, vocab_size)) 129 | # for i, gene_i in enumerate(genes): 130 | # for j, gene_j in enumerate(genes): 131 | # dist1[i,j] = np.linalg.norm(wv1[gene_i] - wv1[gene_j]) 132 | # 133 | # df = pd.DataFrame(dist1, columns=genes, index=genes) 134 | # ax = sns.heatmap(df, cmap=colour_map, square=True) 135 | # plt.show() 136 | -------------------------------------------------------------------------------- /netwalk/walkdataset.py: -------------------------------------------------------------------------------- 1 | ''' This module contains WalkDataset. 2 | ''' 3 | from torch.utils.data import Dataset 4 | from netwalk.utils import Vocabulary 5 | 6 | 7 | class WalkDataset(Dataset): 8 | ''' Create a dataset of walks, where each walk is a sequence of genes. 9 | 10 | Args: 11 | original_walks: a nested list of gene names/IDs. 12 | vocab: A Vocabulary object including all genes in the original_walks. 13 | ''' 14 | def __init__(self, original_walks, vocab): 15 | super(WalkDataset, self).__init__() 16 | self.vocab = vocab 17 | self.walks = self._vocab_index(original_walks, vocab) 18 | 19 | @staticmethod 20 | def _vocab_index(original_walks, vocab): 21 | ''' Translate walks from original node names to integer indices. 22 | Args: 23 | original_walks: The original walks, where each node is represented 24 | by node name/ID. 25 | vocab: A Vocabulary object including all genes in the original_walks. 26 | Returns: 27 | A translated version of original_walks, where each node is 28 | represented with an integer index. These indices starts with 29 | 0 to n with no gap, where n is the number of different nodes 30 | present in at least one of the walks in original_walks. 
31 | 32 | ''' 33 | walks = [] 34 | for walk in original_walks: 35 | walks.append([vocab.index[name] for name in walk]) 36 | return walks 37 | 38 | @classmethod 39 | def read_csv(cls, address, sep): 40 | ''' Read dataset from a file. 41 | 42 | Args: 43 | address: Address of a CSV file containing walks. 44 | sep: A field delimiter. 45 | Returns: 46 | data: A list of walks. 47 | genes: The set of all genes that appear in at least one walk. 48 | ''' 49 | data = [] 50 | genes = set() 51 | with open(address) as fin: 52 | for line in fin: 53 | line = line.strip() 54 | if line == '': 55 | continue 56 | walk = [gene.strip() for gene in line.split(sep)] 57 | data.append(walk) 58 | genes.update(walk) 59 | return data, genes 60 | 61 | @classmethod 62 | def from_csv(cls, address, sep): 63 | ''' Create a WalkDataset from a CSV file. 64 | 65 | Args: 66 | address: Address of a CSV file containing walks. 67 | sep: A field delimiter. 68 | Returns: 69 | A WalkDataset object. 70 | ''' 71 | data, genes = cls.read_csv(address, sep) 72 | vocab = Vocabulary(genes) 73 | return cls(data, vocab) 74 | 75 | def __getitem__(self, idx): 76 | return self.walks[idx] 77 | 78 | def __len__(self): 79 | return len(self.walks) 80 | 81 | 82 | class PairWalkDataset(WalkDataset): 83 | ''' Create a dataset of walks, where each walk is a sequence of genes. 84 | 85 | Args: 86 | original_walks: a nested list of gene names/IDs. 87 | vocab: A Vocabulary object including all genes in the original_walks. 88 | ''' 89 | def __init__(self, original_walks, vocab): 90 | super(PairWalkDataset, self).__init__(original_walks, vocab) 91 | 92 | 93 | @classmethod 94 | def from_csv(cls, address, sep): 95 | ''' Create a WalkDataset from a CSV file. 96 | 97 | Args: 98 | address: Address of a CSV file containing walks. 99 | sep: A field delimiter. 100 | Returns: 101 | A WalkDataset object. 102 | ''' 103 | data = [] 104 | pair_walks, genes = cls.read_csv(address, sep) 105 | for walk_walk in pair_walks: 106 | middle = len(walk_walk) // 2 107 | walk_a, walk_b = walk_walk[:middle], walk_walk[middle:] 108 | data.append((walk_a, walk_b)) 109 | vocab = Vocabulary(genes) 110 | return cls(data, vocab) 111 | 112 | @staticmethod 113 | def _vocab_index(original_walks, vocab): 114 | ''' Translate walks from original node names to integer indices. 115 | Args: 116 | original_walks: The original walks, where each node is represented 117 | by node name/ID. 118 | vocab: A Vocabulary object including all genes in the original_walks. 119 | Returns: 120 | A translated version of original_walks, where each node is 121 | represented with an integer index. These indices starts with 122 | 0 to n with no gap, where n is the number of different nodes 123 | present in at least one of the walks in original_walks. 
124 | 125 | ''' 126 | walks = [] 127 | for walk_a, walk_b in original_walks: 128 | translated_walk_a = [vocab.index[name] for name in walk_a] 129 | translated_walk_b = [vocab.index[name] for name in walk_b] 130 | walks.append((translated_walk_a, translated_walk_b)) 131 | return walks 132 | -------------------------------------------------------------------------------- /netwalk/translator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | 8 | class ID2NameTranslator(object): 9 | def __init__(self, vocab_file_address, sep=','): 10 | assert os.path.isfile(vocab_file_address) 11 | df = pd.read_csv(vocab_file_address, sep=sep) 12 | df.columns = ['ID', 'Name'] 13 | self.df = df 14 | self.ids = df.iloc[:, 0].values 15 | self.names = df.iloc[:, 1].values 16 | self.__id2name = {ID: name for ID, name in zip(self.ids, self.names)} 17 | 18 | def id2name(self, ensemble_id, default=''): 19 | return self.__id2name.get(ensemble_id, default) 20 | 21 | def names2ids(self, names): 22 | selected = self.df[self.df.iloc[:, 1].isin(names)] 23 | data = [] 24 | for name in names: 25 | data.extend(selected[selected.iloc[:,1] == name].values) 26 | data = pd.DataFrame.from_records(data, columns=['ID', 'Name']) 27 | return list(data.ID), list(data.Name) 28 | 29 | 30 | class IDCovertor(object): 31 | def __init__(self, edge_list_file_addresses, sep=','): 32 | idset = set() 33 | for edge_list_file_address in edge_list_file_addresses: 34 | with open(edge_list_file_address) as fin: 35 | for line in fin: 36 | id1, id2, _ = line.strip().split(sep) 37 | idset.add(id1) 38 | idset.add(id2) 39 | n = len(idset) 40 | self.ids = sorted(idset) 41 | self.id2int = {a_id: i for a_id, i in zip(self.ids, range(n))} 42 | self.int2id = {i: a_id for a_id, i in zip(self.ids, range(n))} 43 | 44 | def ids2ints(self, ids): 45 | return [self.id2int[x] for x in ids if x in self.ids] 46 | 47 | def ints2ids(self, ints): 48 | return [self.int2id[int(x)] for x in ints] 49 | 50 | def save(self, json_file_address): 51 | with open(json_file_address, 'w') as fout: 52 | json.dump(self.id2int, fout) 53 | 54 | @classmethod 55 | def load(cls, json_file_address): 56 | with open(json_file_address) as fin: 57 | id2int = json.load(fin) 58 | convertor = IDCovertor([]) 59 | convertor.id2int = id2int 60 | convertor.int2id = {i: a_id for a_id, i in id2int.items()} 61 | convertor.ids = sorted(id2int.keys()) 62 | return convertor 63 | 64 | def translate(self, input_file_address, output_file_address, sep=','): 65 | df = pd.read_csv(input_file_address, sep=sep, header=None) 66 | df.columns = ['ID1', 'ID2', 'Cor'] 67 | with open(output_file_address, 'w') as fout: 68 | for id1, id2, cor in df.itertuples(index=False): 69 | fout.write('{}{}{}{}{}\n'.format(self.id2int[id1], sep, 70 | self.id2int[id2], sep, 71 | cor)) 72 | 73 | 74 | def vocab2id_and_name(vocab, id_convertor_file_path, id2name_translator_file, default_name='', sep=','): 75 | id2name_translator = ID2NameTranslator(id2name_translator_file, sep=sep) 76 | id_convertor = IDCovertor.load(id_convertor_file_path) 77 | id_names = {} 78 | for k in vocab: 79 | ensemble_id = id_convertor.int2id[int(k)] 80 | name = id2name_translator.id2name(ensemble_id, default_name) 81 | id_names[k] = (ensemble_id, name) 82 | return id_names 83 | 84 | 85 | if __name__ == '__main__': 86 | ensemble_id_name_file = '../skeletal_data/mouse.vocab' 87 | # trans = ID2NameTranslator(ensemble_id_name_file, sep=',') 88 | # 
assert trans.id2name('ENSMUSG00000064372') == 'mt-Tp' 89 | # assert trans.id2name('ENSMUSG00000106796') == 'AC124394.4' 90 | # edge_list_file_addresses = ['../Skeletal_Cells/anchored_chicken_imm_0.csv', 91 | # '../Skeletal_Cells/anchored_chicken_ost_0.csv', 92 | # '../Skeletal_Cells/anchored_gar_imm_0.csv', 93 | # '../Skeletal_Cells/anchored_gar_ost_0.csv', 94 | # '../Skeletal_Cells/anchored_frog_imm_0.csv', 95 | # '../Skeletal_Cells/anchored_frog_ost_0.csv', 96 | # '../Skeletal_Cells/anchored_mouse_imm_0.csv', 97 | # '../Skeletal_Cells/anchored_mouse_ost_0.csv'] 98 | # output_file_address = '../skeletal_data/translated_chicken_imm.csv' 99 | # convertor = IDCovertor(edge_list_file_addresses, sep=',') 100 | # for edge_list_file_address in edge_list_file_addresses: 101 | # dir_path, file_name = os.path.split(edge_list_file_address) 102 | # output_file_address = os.path.join(dir_path, f'translated_{file_name}') 103 | # convertor.translate(edge_list_file_address, output_file_address, sep=',') 104 | convertor_file = '../skeletal_data/IDConvertor.json' 105 | # convertor.save(convertor_file) 106 | # con = IDCovertor.load(convertor_file) 107 | # assert con.id2int == convertor.id2int 108 | # assert con.int2id == convertor.int2id 109 | # assert con.ids == convertor.ids 110 | # # Test vocab2id_and_name 111 | v2id_name = vocab2id_and_name(['19210', '19211'], convertor_file, ensemble_id_name_file, default_name='', sep=',') 112 | assert v2id_name['19210'][0] == 'ENSMUSG00000114019' 113 | assert v2id_name['19211'][0] == 'ENSMUSG00000114025' 114 | print(v2id_name) 115 | -------------------------------------------------------------------------------- /dangle.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from similarity import Similarity 4 | import numpy as np 5 | import pandas as pd 6 | import random 7 | import os 8 | import json 9 | from sklearn.model_selection import train_test_split 10 | 11 | 12 | def build_backbone(anchors, alphas, weight, edge_percentage): 13 | dangling = {} 14 | for anchor, alpha in zip(anchors,alphas): 15 | pseudo_anchor = 'pseudo_{}'.format(anchor) 16 | dangles = dangling_structure(pseudo_anchor, 17 | alpha, 18 | weight, 19 | edge_percentage) 20 | dangles[(anchor, pseudo_anchor)] = weight 21 | dangling.update(dangles) 22 | return dangling 23 | 24 | 25 | def dangling_structure(gene, alpha, weight, edge_percentage): 26 | num_dangles = alpha 27 | dangles = ['{}_{:0>3d}'.format(gene, i) for i in range(alpha)] 28 | sim = {} 29 | potential_edges = [] 30 | for gene_i in dangles: 31 | for gene_j in dangles: 32 | if gene_i == gene_j: 33 | break 34 | else: 35 | potential_edges.append((gene_i, gene_j)) 36 | random.shuffle(potential_edges) 37 | connected_genes = set() 38 | for gene_i, gene_j in potential_edges: 39 | if {gene_i, gene_j} < connected_genes: 40 | continue 41 | elif len(connected_genes) < num_dangles: 42 | connected_genes.add(gene_i) 43 | connected_genes.add(gene_j) 44 | sim[(gene_i, gene_j)] = weight 45 | 46 | sim[(gene, dangles[0])] = weight 47 | for gene_i, gene_j in potential_edges: 48 | if random.random() < edge_percentage: 49 | sim[(gene_i, gene_j)] = weight 50 | 51 | return sim 52 | 53 | 54 | def main(experiment_name, phenotypes, data_directory, anchor_genes, 55 | num_replicates=1, percent=0.4, num_anchors=50, min_dangle_size=3, 56 | max_dangle_size=10, test_ratio=0.5): 57 | assert isinstance(phenotypes, list) 58 | alphas = random.choices(range(min_dangle_size, max_dangle_size), 59 | k=int(num_anchors * 
test_ratio)) 60 | assert len(alphas) < len(anchor_genes) 61 | anchor_train_groups = [] 62 | anchor_test_groups = [] 63 | backbones = [] 64 | # Create all backbones 65 | for rep_id in range(num_replicates): 66 | random.shuffle(anchor_genes) 67 | candidates = anchor_genes[:int(num_anchors)] 68 | genes_of_interest_train, genes_of_interest_test = train_test_split( 69 | candidates, 70 | shuffle=True, 71 | test_size=test_ratio) 72 | 73 | anchor_train_groups.append(genes_of_interest_train) 74 | anchor_test_groups.append(genes_of_interest_test) 75 | backbones.append( 76 | build_backbone(anchors=anchor_train_groups[rep_id], alphas=alphas, 77 | weight=1, edge_percentage=percent)) 78 | # Write train anchors to file 79 | with open(os.path.join(experiment_name, 'train_anchors.csv'), 'w') as fout: 80 | for gene_group in anchor_train_groups: 81 | fout.write(','.join(gene_group)) 82 | fout.write("\n") 83 | # Write test anchors to file 84 | with open(os.path.join(experiment_name, 'test_anchors.csv'), 'w') as fout: 85 | for gene_group in anchor_test_groups: 86 | fout.write(','.join(gene_group)) 87 | fout.write("\n") 88 | # Adding the backbones and create the similarity object 89 | for pheno in phenotypes: 90 | file_name = os.path.join(data_directory, "{}.csv".format(pheno)) 91 | for rep_id in range(num_replicates): 92 | sim_file_name = "anchored_{}_{}.csv".format(pheno, str(rep_id)) 93 | out_address = os.path.join(experiment_name, sim_file_name) 94 | similarity = Similarity(file_name, 95 | anchors=anchor_train_groups[rep_id], 96 | alphas=alphas, string_id=True) 97 | similarity.transform() 98 | similarity.apply_threshold(lower_cor=0.2, upper_cor=0.8, 99 | value=0) 100 | similarity.augment(backbones[rep_id]) 101 | 102 | similarity.to_csv(out_address) 103 | 104 | 105 | 106 | if __name__ == '__main__': 107 | parser = argparse.ArgumentParser(description='Generate dangling structures') 108 | parser.add_argument('-c', '--config', metavar='JSON file path', 109 | action='store', required=True, 110 | help='Path to a config file') 111 | args = parser.parse_args() 112 | config_file_address = args.config 113 | with open(config_file_address) as fin: 114 | params = json.load(fin) 115 | homeostasis_genes = pd.read_csv(params['anchor_file_address'], 116 | dtype=str).iloc[:,0].values 117 | main(experiment_name=params['experiment_name'], 118 | phenotypes=params['phenotypes'], 119 | data_directory=params['data_directory'], 120 | anchor_genes=homeostasis_genes, 121 | num_replicates=params['n_replicates'], 122 | percent=params['percentage'], 123 | num_anchors=params['n_anchors'], 124 | min_dangle_size=params['min_dangle_size'], 125 | max_dangle_size=params['max_dangle_size'], 126 | test_ratio=params['test_ratio']) 127 | 128 | -------------------------------------------------------------------------------- /test/test_temp.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from netwalk.temp import * 3 | from netwalk.utils import load 4 | 5 | 6 | class TestTemp(unittest.TestCase): 7 | def test_transform(self): 8 | edge_list = {("gene1", "gene2"): 0.5, 9 | ("gene2", "gene3"): -0.7, 10 | ("gene1", "gene3"): 0.0} 11 | transformed = transform(edge_list) 12 | transformed_dict = {k: val for k, val in transformed.items()} 13 | expected = {("gene1", "gene2"): 0.75, 14 | ("gene2", "gene3"): 0.15, 15 | ("gene1", "gene3"): 0.5} 16 | transformed_vals = list(transformed_dict.values()) 17 | expected_vals = list(expected.values()) 18 | for a, b in zip(transformed_vals, expected_vals): 19 | 
self.assertAlmostEqual(a, b, places=5) 20 | 21 | edge_list = load("data/fake_networks/network_2.csv", sep=",") 22 | transformed = transform(edge_list) 23 | expected = {("g1", "g2"): 0.9, 24 | ("g2", "g3"): 0.55, 25 | ("g2", "g1"): 0.9, 26 | ("g3", "g2"): 0.55, 27 | ("g2", "g4"): 0.75, 28 | ("g4", "g3"): 0.55, 29 | ("g3", "g1"): 0.75, 30 | ("g1", "g3"): 0.75, 31 | ("g1", "g4"): 0.10, 32 | ("g4", "g1"): 0.10, 33 | ("g4", "g2"): 0.75, 34 | ("g3", "g4"): 0.55} 35 | 36 | transformed_dict = {k: val for k, val in transformed.items()} 37 | transformed_vals = list(transformed_dict.values()) 38 | expected_vals = list(expected.values()) 39 | for a, b in zip(transformed_vals, expected_vals): 40 | self.assertAlmostEqual(a, b, places=5) 41 | 42 | def test_filter(self): 43 | edge_list = {("gene1", "gene2"): 0.5, 44 | ("gene2", "gene3"): -0.7, 45 | ("gene1", "gene3"): 0.0} 46 | transformed = transform(edge_list) 47 | transformed_dict = {k: val for k, val in transformed.items()} 48 | filtered = filter(transformed_dict, exclude=(0.3, 0.6)) 49 | filtered_dict = {k: val for k, val in filtered.items()} 50 | 51 | expected = {("gene1", "gene2"): 0.75, 52 | ("gene2", "gene3"): 0.15} 53 | 54 | filtered_vals = list(filtered_dict.values()) 55 | expected_vals = list(expected.values()) 56 | for a, b in zip(filtered_vals, expected_vals): 57 | self.assertAlmostEqual(a, b, places=5) 58 | 59 | edge_list = {("gene1", "gene2"): 0.5, 60 | ("gene2", "gene3"): -0.7, 61 | ("gene1", "gene3"): 0.0} 62 | 63 | filtered = filter(edge_list, exclude=(0, 0.7)) 64 | filtered_dict = {k: val for k, val in filtered.items()} 65 | 66 | expected = {("gene1", "gene2"): -0.7, 67 | ("gene2", "gene3"): 0.0} 68 | 69 | filtered_vals = list(filtered_dict.values()) 70 | expected_vals = list(expected.values()) 71 | for a, b in zip(filtered_vals, expected_vals): 72 | self.assertAlmostEqual(a, b, places=5) 73 | 74 | def test_overlay_networks(self): 75 | original_edge_list_1 = Similarity({("gene1", "gene2"): 0.5, 76 | ("gene2", "gene3"): 0.7, 77 | ("gene1", "gene3"): 0.6, 78 | ("gene1", "gene4"): 0.3, 79 | ("gene2", "gene4"): 0.1, 80 | ("gene3", "gene4"): 0.0, 81 | ("gene1", "gene3"): 0.0}) 82 | 83 | original_edge_list_2 = Similarity({("gene1", "gene2"): 0.5, 84 | ("gene2", "gene3"): 0.7, 85 | ("gene1", "gene3"): 0.0, 86 | ("gene1", "gene4"): 0.0, 87 | ("gene2", "gene4"): 0.3, 88 | ("gene4", "gene3"): 0.7, 89 | ("gene1", "gene3"): 0.2}) 90 | 91 | edge_list_1 = Similarity({("gene1", "gene2"): 0.5, 92 | ("gene2", "gene3"): 0.7, 93 | ("gene1", "gene3"): 0.6, 94 | ("gene1", "gene4"): 0.3}) 95 | 96 | edge_list_2 = Similarity({("gene1", "gene2"): 0.5, 97 | ("gene2", "gene3"): 0.7, 98 | ("gene2", "gene4"): 0.3}) 99 | 100 | net_1, net_2 = overlay_networks(net_a_similarity=edge_list_1, net_b_similarity=edge_list_2, 101 | original_net_a=original_edge_list_1, original_net_b=original_edge_list_2) 102 | 103 | assert set(net_1.symmetric_key_set()) == set(net_2.symmetric_key_set()) 104 | net_1_dict = {k: val for k, val in net_1.items()} 105 | net_2_dict = {k: val for k, val in net_2.items()} 106 | assert set(net_1_dict.values()) == {0.5, 0.7, 0.6, 0.3, 0.1} 107 | assert set(net_2_dict.values()) == {0.5, 0.7, 0.3, 0.2, 0.0} 108 | 109 | def test_create_spine(self): 110 | net1 = Similarity.load("../data/fake_networks/network_1.csv", sep=",") 111 | net2 = Similarity.load("../data/fake_networks/network_2.csv", sep=",") 112 | 113 | expected_spine = ["g1", "g4"] 114 | expected_pseudo_spine = ["pseudo_g1", "pseudo_g4"] 115 | expected_similarity = {("pseudo_g1", "pseudo_g10"): 
0.5, 116 | ("pseudo_g1", "pseudo_g11"): 0.5, 117 | ("pseudo_g1", "pseudo_g4"): -0.8, 118 | ("pseudo_g4", "pseudo_g40"): 0.5, 119 | ("pseudo_g4", "pseudo_g41"): 0.5} 120 | 121 | spine, pseudo_spine, backbone = create_spine(spine=["g1", "g4"], net_a_tsfmd_similarity=net1, 122 | net_b_tsfmd_similarity=net2, 123 | prefix='pseudo_', alpha=2, weight=0.5) 124 | assert set(spine) == set(expected_spine) 125 | assert set(pseudo_spine) == set(expected_pseudo_spine) 126 | 127 | self.assertEqual(len(backbone), len(expected_similarity)) 128 | 129 | for key, val in backbone.items(): 130 | assert key in expected_similarity.keys() 131 | assert expected_similarity[key] == val 132 | 133 | expected_spine = ["g1", "g4", "g3"] 134 | expected_pseudo_spine = ["pseudo_g1", "pseudo_g4", "pseudo_g3"] 135 | expected_similarity = {("pseudo_g1", "pseudo_g3"): 0.5, 136 | ("pseudo_g1", "pseudo_g10"): 0.5, 137 | ("pseudo_g1", "pseudo_g11"): 0.5, 138 | ("pseudo_g4", "pseudo_g1"): -0.8, 139 | ("pseudo_g3", "pseudo_g30"): 0.5, 140 | ("pseudo_g3", "pseudo_g31"): 0.5, 141 | ("pseudo_g4", "pseudo_g40"): 0.5, 142 | ("pseudo_g4", "pseudo_g41"): 0.5} 143 | 144 | spine, pseudo_spine, backbone = create_spine(spine=["g1", "g4", "g3"], net_a_tsfmd_similarity=net1, 145 | net_b_tsfmd_similarity=net2, 146 | prefix='pseudo_', alpha=2, weight=0.5) 147 | 148 | assert set(spine) == set(expected_spine) 149 | assert set(pseudo_spine) == set(expected_pseudo_spine) 150 | 151 | self.assertEqual(len(backbone), len(expected_similarity)) 152 | 153 | for key, val in backbone.items(): 154 | assert key in expected_similarity.keys() 155 | assert expected_similarity[key] == val 156 | 157 | def test_add_anchor(self): 158 | net1 = Similarity.load("../data/fake_networks/network_1.csv", sep=",") 159 | net2 = Similarity.load("../data/fake_networks/network_2.csv", sep=",") 160 | 161 | spine_similarity_1 = {("pseudo_g1", "pseudo_g3"): 0.5, 162 | ("pseudo_g1", "pseudo_g10"): 0.5, 163 | ("pseudo_g1", "pseudo_g11"): 0.5, 164 | ("pseudo_g4", "pseudo_g1"): -0.8, 165 | ("pseudo_g3", "pseudo_g30"): 0.5, 166 | ("pseudo_g3", "pseudo_g31"): 0.5, 167 | ("pseudo_g4", "pseudo_g40"): 0.5, 168 | ("pseudo_g4", "pseudo_g41"): 0.5} 169 | 170 | spine_similarity_2 = {("pseudo_g1", "pseudo_g10"): 0.5, 171 | ("pseudo_g1", "pseudo_g11"): 0.5, 172 | ("pseudo_g4", "pseudo_g1"): -0.8, 173 | ("pseudo_g4", "pseudo_g40"): 0.5, 174 | ("pseudo_g4", "pseudo_g41"): 0.5} 175 | 176 | anchored_net1 = add_anchor(net1, pseudo_similarity=backbone, spine_genes=["g1", "g4", "g3"], 177 | pseudo_spine_genes=["pseudo_g1", "pseudo_g4", "pseudo_g3"], 178 | weight=0.5) 179 | 180 | anchored_net2 = add_anchor(net2, pseudo_similarity=backbone, spine_genes=["g1", "g4"], 181 | pseudo_spine_genes=["pseudo_g1", "pseudo_g4"], 182 | weight=0.7) 183 | 184 | for key, val in anchored_net1.items(): 185 | assert key in spine_similarity_1.keys() 186 | for key, val in anchored_net2.items(): 187 | assert key in spine_similarity_2.keys() 188 | 189 | result_keys = [anchored_net1[x] for x in anchored_net1.keys()] 190 | self.assertListEqual(result_keys, net1.update(spine_similarity_1)) 191 | 192 | result_keys = [anchored_net2[x] for x in anchored_net2.keys()] 193 | self.assertListEqual(result_keys, net2.update(spine_similarity_2)) 194 | -------------------------------------------------------------------------------- /dimensionality_reduction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import argparse 5 | import random 6 | from 
sklearn.decomposition import PCA 7 | from sklearn.manifold import TSNE 8 | from matplotlib import pyplot 9 | from gensim.models import Word2Vec 10 | import numpy as np 11 | import pandas as pd 12 | import seaborn as sns 13 | from netwalk.translator import ID2NameTranslator, IDCovertor 14 | from scipy.spatial.distance import cdist 15 | from itertools import combinations 16 | 17 | def pca_visualization(model, out_file_name): 18 | x = model[model.wv.vocab] 19 | pca = PCA(n_components=2) 20 | result = pca.fit_transform(x) 21 | # create a scatter plot of the projection 22 | pyplot.scatter(result[:, 0], result[:, 1]) 23 | #pyplot.xlim(-25, 25) 24 | #pyplot.ylim(-25, 25) 25 | words = list(model.wv.vocab) 26 | #for i, word in enumerate(words): 27 | #pyplot.annotate(word, xy=(result[i, 0], result[i, 1])) 28 | pyplot.savefig(out_file_name) 29 | 30 | 31 | def tsne_plot(model, out_file_name, perplexity=30, components=2, init='pca', 32 | num_iter=500, rand_state=0): 33 | labels = [] 34 | tokens = [] 35 | 36 | for word in model.wv.vocab: 37 | tokens.append(model[word]) 38 | labels.append(word) 39 | 40 | tsne_model = TSNE(perplexity=perplexity, n_components=components, 41 | init=init, n_iter=num_iter, random_state=rand_state) 42 | new_values = tsne_model.fit_transform(tokens) 43 | 44 | x = [] 45 | y = [] 46 | for value in new_values: 47 | x.append(value[0]) 48 | y.append(value[1]) 49 | print(labels) 50 | print(labels["pseudo" in labels]) 51 | c=["royalblue" if "pseudo" in x else "orangered" for x in labels] 52 | pyplot.figure(figsize=(16, 16)) 53 | for i in range(len(x)): 54 | pyplot.scatter(x[i], y[i], color=c[i], s=30) 55 | 56 | pyplot.savefig(out_file_name) 57 | 58 | 59 | def tsne_visualize(model, gene, list_names, vocab_length, num_components, out_file_name, converter, translate): 60 | """ Plot in seaborn the results from the t-SNE dimensionality reduction algorithm of the vectors of a query gene, 61 | its list of most similar genes, and a list of other genes. 
62 | vv """ 63 | arrays = np.empty((0, vocab_length), dtype='f') 64 | gene_labels = [gene] 65 | color_list = ['red'] 66 | 67 | # adds the vector of the query gene 68 | arrays = np.append(arrays, model.wv.__getitem__([gene]), axis=0) 69 | 70 | # gets list of most similar genes 71 | close_genes = model.wv.most_similar([gene]) 72 | 73 | # adds the vector for each of the closest genes to the array 74 | for gne_score in close_genes: 75 | gne_vector = model.wv.__getitem__([gne_score[0]]) 76 | gene_labels.append(gne_score[0]) 77 | color_list.append('blue') 78 | arrays = np.append(arrays, gne_vector, axis=0) 79 | 80 | # adds the vector for each of the genes from list_names to the array 81 | for gne in list_names: 82 | gne_vector = model.wv.__getitem__([gne]) 83 | gene_labels.append(gne) 84 | color_list.append('green') 85 | arrays = np.append(arrays, gne_vector, axis=0) 86 | 87 | # Reduces the dimensionality from 300 to 50 dimensions with PCA 88 | reduc = PCA(n_components=num_components).fit_transform(arrays) 89 | # Finds t-SNE coordinates for 2 dimensions 90 | np.set_printoptions(suppress=True) 91 | Y = TSNE(n_components=2, random_state=0, perplexity=15).fit_transform(reduc) 92 | 93 | gene_labels = ints_to_names(gene_labels, translate, converter) 94 | 95 | # Sets everything up to plot 96 | df = pd.DataFrame({'x': [x for x in Y[:, 0]], 97 | 'y': [y for y in Y[:, 1]], 98 | 'genes': gene_labels, 99 | 'color': color_list}) 100 | 101 | fig, _ = pyplot.subplots() 102 | fig.set_size_inches(9, 9) 103 | 104 | # Basic plot 105 | sns.set_style("ticks") 106 | p1 = sns.regplot(data=df, 107 | x="x", 108 | y="y", 109 | fit_reg=False, 110 | marker="o", 111 | scatter_kws={'s': 40, 112 | 'facecolors': df['color'] 113 | } 114 | ) 115 | 116 | # Adds annotations one by one with a loop 117 | for line in range(0, df.shape[0]): 118 | p1.text(df["x"][line], 119 | df['y'][line], 120 | ' ' + df["genes"][line].title(), 121 | horizontalalignment='left', 122 | verticalalignment='bottom', size='medium', 123 | color=df['color'][line], 124 | weight='normal' 125 | ).set_size(15) 126 | 127 | pyplot.xlim(Y[:, 0].min()-50, Y[:, 0].max()+50) 128 | pyplot.ylim(Y[:, 1].min()-50, Y[:, 1].max()+50) 129 | 130 | pyplot.title('t-SNE visualization for {}'.format(gene.title())) 131 | pyplot.savefig(out_file_name) 132 | 133 | def names_to_ints(names, trans, convertor): 134 | ids, names = trans.names2ids(names) 135 | ints = [str(i) for i in convertor.ids2ints(ids)] 136 | return ints, ids, names 137 | 138 | def ints_to_names(ints, trans, convertor): 139 | #int to id 140 | ids = [i for i in convertor.ints2ids(ints)] 141 | #id to name 142 | names = [trans.id2name(x) for x in ids] 143 | return names 144 | 145 | if __name__ == '__main__': 146 | parser = argparse.ArgumentParser(description='Generate datasets') 147 | parser.add_argument('-c', '--config', metavar='JSON file path', 148 | action='store', required=True, 149 | help='Path to a config file') 150 | args = parser.parse_args() 151 | # read training config 152 | with open(args.config) as fin: 153 | params = json.load(fin) 154 | 155 | ensemble_id_name_file = params['vocab'] 156 | convertor_file = os.path.join(params['experiment_name'],'IDConvertor.json') 157 | trans = ID2NameTranslator(ensemble_id_name_file, sep=',') 158 | convertor = IDCovertor.load(convertor_file) 159 | 160 | for rep_id in range(params['n_replicates']): 161 | for pheno in params['phenotypes']: 162 | path = os.path.join(params['experiment_name'], 163 | '{}_{}.model'.format(pheno, rep_id)) 164 | viz_path = 
os.path.join(params['experiment_name'], 165 | '{}_{}_tsne.pdf'.format(pheno, 166 | rep_id)) 167 | pca_path = os.path.join(params['experiment_name'], 168 | '{}_{}_pca.pdf'.format(pheno, 169 | rep_id)) 170 | # load model 171 | model = Word2Vec.load(path) 172 | genes = list(model.wv.vocab) 173 | # make tsne 174 | #tsne_plot(model, out_file_name=viz_path) 175 | #pca_visualization(model, out_file_name=pca_path) 176 | names = params['select_genes'] 177 | ints, ids, names = names_to_ints(names, trans, convertor) 178 | for ind, g, name in zip(ints, ids, names): 179 | sim_path = os.path.join(params['experiment_name'], 180 | '{}_{}_most_similar_to_{}.pdf'.format(pheno, 181 | rep_id, name)) 182 | rand_path = os.path.join(params['experiment_name'], 183 | '{}_{}_random_compared_to_{}.pdf'.format(pheno, 184 | rep_id, g)) 185 | # make a visualization of select genes and their most similar genes 186 | if ind in model.wv.vocab: 187 | negative_ints = [i[0] for i in model.wv.most_similar(negative=[ind])] 188 | tsne_visualize(model, ind, list_names=negative_ints, vocab_length=params['embd_dim'], num_components=2, out_file_name=sim_path, translate=trans,converter=convertor) 189 | #sampled_ints = random.choices(genes, k=20) 190 | #tsne_visualize(model, ind, list_names=sampled_ints, vocab_length=params['embd_dim'], num_components=2, out_file_name=rand_path, translate=trans,converter=convertor) 191 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2): 192 | path_1 = os.path.join(params['experiment_name'], 193 | '{}_{}.model'.format(pheno_1, rep_id)) 194 | path_2 = os.path.join(params['experiment_name'], 195 | '{}_{}.model'.format(pheno_2, rep_id)) 196 | model_1 = Word2Vec.load(path_1) 197 | model_2 = Word2Vec.load(path_2) 198 | ints, ids, names = names_to_ints(names, trans, convertor) 199 | targets_in_model1 = [ (gene_i, name_i) for gene_i, name_i in zip(ints, names) if gene_i in model_1.wv.vocab] 200 | targets_in_model2 = [ (gene_i, name_i) for gene_i, name_i in zip(ints, names) if gene_i in model_2.wv.vocab] 201 | x1 = np.array([model_1.wv[gene_i] for gene_i, _ in targets_in_model1]) 202 | x2 = np.array([model_2.wv[gene_i] for gene_i, _ in targets_in_model2]) 203 | # Calculate the distance between elements of x1 and x2 as a distance matrix 204 | dist = cdist(x1, x2, metric='cosine')/2 205 | df = pd.DataFrame(dist) 206 | 207 | df.columns = [name_i for _, name_i in targets_in_model2] 208 | df.index = [name_i for _, name_i in targets_in_model1] 209 | matrix_path = os.path.join(params['experiment_name'], 210 | '{}_{}_{}_matrix.pdf'.format(pheno_1, pheno_2, rep_id)) 211 | pyplot.figure() 212 | print(df.index, df.columns) 213 | print(df.shape) 214 | ax = sns.heatmap(df, square=True, vmin=0, vmax=1) 215 | #ax.tick_params(left=False, bottom=False) 216 | ax.set_yticklabels(list(df.index)) 217 | ax.set_xticklabels(list(df.columns)) 218 | figure = ax.get_figure() 219 | figure.savefig(matrix_path) 220 | pyplot.close() 221 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Juxtapose 2 | 3 | 4 |
5 | **Table of Contents**
6 |
7 | 1. About The Project
8 | 2. Getting Started
9 |    - Prerequisites
10 |    - Installation
11 |    - Adding more tests
12 | 3. Usage
13 |    - Small network embedding
14 |    - Large networks
15 |    - Translating IDs to integers
16 | 4. Contributing
17 | 5. Versioning
18 | 6. Contact
19 |
33 | ## About The Project
34 |
37 | 38 | Welcome to Juxtapose, a Python tool that can be used to compare gene co-expression networks (GCNs). Juxtapose, together with different similarity measures, can be utilized for comparative transcriptomics between a set of organisms. While we focus on its application to comparing co-expression networks across species in evolutionary studies, Juxtapose is also generalizable to co-expression network comparisons across tissues or conditions within the same species. A word embedding strategy commonly used in natural language processing is used to generate gene embeddings based on walks made through the GCNs. 39 | 40 | You may also suggest changes by forking this repo and creating a pull request or opening an issue. 41 | 42 | 43 | ## Getting Started 44 | 45 | The following steps will guide you through the process of running Juxtapose on your local machine or an [AWS spot instance](https://aws.amazon.com/ec2/spot/?cards.sort-by=item.additionalFields.startDateTime&cards.sort-order=asc). 46 | 47 | ### Prerequisites 48 | 49 | The main dependencies of Juxtapose are gensim, scikit-learn, numpy, pandas, and scipy. See requirements.txt for the complete list of requirements. 50 | 51 | ### Installation 52 | 53 | It is good practice to use a virtual environment for deploying Python programs. Using conda, we can create an environment named juxtapose. The environment name is arbitrary. 54 | ```sh 55 | conda create -n juxtapose python=3.6 56 | ``` 57 | 58 | After downloading the Juxtapose repository, the following command can be run to install the requirements. 59 | ```sh 60 | make setup 61 | ``` 62 | ### Adding more tests 63 | 64 | New tests should be added under the test directory as modules whose names start with test_. 65 | 66 | 67 | ## Usage 68 | 69 | To run Juxtapose, two JSON files are required that contain the desired parameters for (1) creating an anchored network using a set of genes and making walks through this network, and (2) running an embedding method to obtain pairwise local distances between genes as well as a global similarity between networks, and producing results, including visualizations from biclustering the local pairwise distances. 70 | 71 | ### Small network embedding 72 | Let us take an example of embedding a simple line network. 73 | 74 |
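Each network is given to Juxtapose as a weighted edge list in CSV form, as described below. If you would like to generate a toy network of your own in that format, the short sketch that follows shows one way to do it; it is only an illustration, and the output file name my_line.csv is hypothetical. The pre-made line_1.csv and line_2.csv used in this example can be taken as-is.

```python
# A minimal sketch (hypothetical output file name) that writes a line network in the
# three-column edge-list format used by line_1.csv: source,target,weight, with each
# edge listed in both directions.
import csv

n_nodes = 11  # nodes 0..10, as in the line_1.csv example shown below

with open("my_line.csv", "w", newline="") as handle:
    writer = csv.writer(handle)
    for i in range(n_nodes - 1):
        writer.writerow([i, i + 1, 1])  # edge i -> i+1 with weight 1
        writer.writerow([i + 1, i, 1])  # and the reverse direction
```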
77 | 78 | We require a csv file that contains the edge list representation of the network. In our case, we have line_1.csv and line_2.csv. 79 | 80 | Content of line_1.csv and line_2.csv: 81 | ```sh 82 | 0,1,1 83 | 1,0,1 84 | 1,2,1 85 | 2,1,1 86 | 2,3,1 87 | 3,2,1 88 | 3,4,1 89 | 4,3,1 90 | 4,5,1 91 | 5,4,1 92 | 5,6,1 93 | 6,5,1 94 | 6,7,1 95 | 7,6,1 96 | 7,8,1 97 | 8,7,1 98 | 8,9,1 99 | 9,8,1 100 | 9,10,1 101 | 10,9,1 102 | ``` 103 | We have the config/JSON files stored in the test/data folder. 104 | The example contents of line-config.json used for adding anchors to the networks: 105 | ```sh 106 | { 107 | "n_replicates": 10, 108 | "percentage": 0.4, 109 | "n_anchors": 6, 110 | "anchor_test_ratio": 0.5, 111 | "min_dangle_size": 3, 112 | "max_dangle_size": 10, 113 | "anchor_file_address": "test/data/line_anchors.csv", 114 | "phenotypes": ["1", "2"], 115 | "experiment_name": "Line", 116 | "data_directory": "test/data" 117 | } 118 | ``` 119 | 120 | The example contents of Line-train-config.json used for model training and visualizations: 121 | ```sh 122 | { 123 | "experiment_name": "Line", 124 | "phenotypes": ["1", "2"], 125 | "walk_per_node": 1000, 126 | "walk_length": 50, 127 | "n_iter": 1, 128 | "n_workers": 20, 129 | "embd_dim": 10, 130 | "window": 2, 131 | "min_count": 2, 132 | "negatives": 5, 133 | "alpha": 0.01, 134 | "n_replicates": 1, 135 | "min_alpha": 0.001 136 | } 137 | ``` 138 | To run the anchoring step, we also require the genes/nodes of the network that will be used as the anchor points in the networks that are going to be compared. Because the networks will be compared, these synthetic structures attached to the real networks should be the same in both. We have provided line_anchors.csv for this example, but this list can be tailored or limited to any set of nodes a user would like to select as potential anchors. The name of the file used needs to be set in the config file using the "anchor_file_address" parameter. 139 | 140 | To add anchor nodes, run the following command. 141 | ```sh 142 | python3 dangle.py --config test/data/line-config.json 143 | ``` 144 | 145 | To generate the intermediate walk files, if necessary, run the following command. 146 | ```sh 147 | python3 dataset_generator.py --config test/data/Line-train-config.json 148 | ``` 149 | 150 | Next, runner.py will train the models for each network, calculate the local and global similarity measures between genes, and bicluster the local similarity results. If a full co-expression network is used and it is not possible to generate the complete matrix, there is also an option to select only a percentage of each bicluster in order to make a representative visualization. 151 | ```sh 152 | python3 runner.py --config test/data/Line-train-config.json 153 | ``` 154 | It should also be noted that if the models for the networks have already been trained and only the similarity measures and biclustering need to be run, the "--no-train" option can be specified as below. 155 | ```sh 156 | python3 runner.py --config test/data/Line-train-config.json --no-train 157 | ``` 158 | We have provided other datasets (circle, cross, heart, and brain) of various sizes and complexity/density that can be used for further testing. All can be found in the test data folder. 159 | 160 | ### Large networks 161 | It will not always be possible to compare larger networks on many machines due to the large memory requirements as the number of edges in the networks increases.
As such, we recommend an AWS spot instance for more affordable resources if no other resources are available to you. In order to set up an instance that will work for a larger network, e.g. 10,000+ genes, one option would be to go to the [EC2 Dashboard](https://aws.amazon.com/ec2/getting-started/) and make a spot request. 162 | 163 | Make a spot request. 164 |
167 | 168 | Selecting an AMI. We recommend using Ubuntu Server 18.04 LTS (HVM), SSD Volume Type. 169 | 170 | Selecting an instance type. 171 |
174 | 175 | This request will also require a [key pair](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html#having-ec2-create-your-key-pair) and [enabling inbound SSH traffic from your IP address to your instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/authorizing-access-to-an-instance.html). If these have not been set up already, there is the option to create a new key pair and security group below where they are requested in the spot instance template. The remaining parts of the spot request are optional and can be changed according to your needs. 176 | 177 |
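If you prefer to script this part instead of using the console, the key pair can also be created programmatically. The sketch below is only an illustration and assumes boto3 is installed and AWS credentials are configured; the key name and region are placeholders, not something Juxtapose requires.

```python
# Minimal sketch: create an EC2 key pair and save the private key so it can be used
# later with "ssh -i keypair.pem ...". Assumes boto3 and configured AWS credentials;
# the key name and region are illustrative.
import os
import boto3

ec2 = boto3.client("ec2", region_name="us-east-1")
response = ec2.create_key_pair(KeyName="juxtapose-key")

with open("keypair.pem", "w") as handle:
    handle.write(response["KeyMaterial"])

os.chmod("keypair.pem", 0o400)  # ssh refuses private keys that are world-readable
```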
181 | 182 | Once the instance is created, use ssh to connect to the instance. A generic example is provided below. 183 | ```sh 184 | ssh -i "keypair.pem" ubuntu@ec2-52-23-241-60.compute-1.amazonaws.com 185 | ``` 186 | The actual command to ssh to the instance can be obtained using the Amazon EC2 console. Go to Instances and click the Connect button, which will provide the required command. 187 | 188 |
191 | 192 | Then the following will need to be run to set up Python on the instance. 193 | ```sh 194 | sudo apt update 195 | sudo apt install python3-pip 196 | python3 -m pip install --user numpy scipy matplotlib 197 | pip3 install --upgrade gensim 198 | pip3 install seaborn 199 | pip3 install -U scikit-learn 200 | pip3 install torch torchvision 201 | ``` 202 | A [volume](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ebs-creating-volume.html) is also required to store data and results. The volume will then need to be attached to the instance. 203 |
207 | 208 | Once attached, the volume created to store the data as well as Juxtapose can be mounted as follows. 209 | ```sh 210 | lsblk # the volume we made is called xvdf 211 | mkdir experiment # make a directory 212 | chmod -R 744 experiment/ # change its permissions 213 | sudo mount /dev/xvdf experiment/ # mount the volume to the new directory 214 | cd experiment/ # go to the directory and start working 215 | ``` 216 | After the volume is attached to the spot instance, the code can be downloaded into the folder where the volume is mounted, and Juxtapose can be run as was done above with the line network example. 217 | 218 | ### Translating IDs to integers 219 | There are also options to translate node IDs to integers if they are not already integers in the original networks. Converting the names to integers can save a lot of memory and result in a quicker analysis. The following command can be used to convert names to integers after the anchoring procedure has been completed and all nodes that will be part of the networks are included in the anchored network files, i.e. experiment_anchored_*.csv. 220 | 221 | ```sh 222 | python3 test_translator.py --config test/data/line-config.json 223 | ``` 224 | 225 | This will produce files named like the original anchored networks, prefixed with translated_ (translated_anchored_*.csv). Also, a JSON file, IDConvertor.json, will be produced to save the translation between the integers and the original IDs. This allows for easy translation back to the original names or IDs for downstream analyses. 226 | 227 | Other means of visualization that have not been covered here are included in dimensionality_reduction.py, including t-SNE and PCA visualizations of the embedded datasets. 228 | A tutorial on how to visualize specific genes, as well as the genes closest to or furthest from them in the embedding space, will be added in the future. 229 | 230 | ## Versioning 231 | 232 | We use [Semantic Versioning 2.0.0](http://semver.org/) for versioning.
233 | 234 | 235 | 236 | ## Contact 237 | 238 | **Katie Ovens** - katie.ovens@usask.ca 239 | Project Link: [https://github.com/klovens/juxtapose](https://github.com/klovens/juxtapose) 240 | -------------------------------------------------------------------------------- /runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import argparse 5 | import copy 6 | import warnings 7 | import random 8 | from scipy import stats 9 | from itertools import combinations 10 | from sklearn import linear_model 11 | from scipy.stats.mstats import gmean 12 | from network_stats import alignment_permutation_test 13 | import numpy as np 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | from sklearn.cluster import SpectralBiclustering 18 | from dimensionality_reduction import * 19 | from gensim.models import Word2Vec 20 | from scipy.optimize import linear_sum_assignment 21 | from sklearn.metrics.pairwise import cosine_similarity 22 | from netwalk.translator import IDCovertor 23 | import logging 24 | from collections import Counter 25 | from multiprocessing import Pool 26 | from scipy.spatial.distance import cdist 27 | logging.basicConfig(level=logging.DEBUG, filemode='w', filename='Experiment.log') 28 | 29 | 30 | warnings.filterwarnings("ignore", category=DeprecationWarning) 31 | 32 | 33 | def cdist2(x1, x2, metric): 34 | if metric == 'angular': 35 | d = 1 - cdist(x1, x2, metric='cosine') 36 | return np.clip(a=d, a_min=-1, a_max=1) / np.pi 37 | else: 38 | return cdist(x1, x2, metric) 39 | 40 | def linear_transform(path_1, path_2, mbd_dim): 41 | model_1 = Word2Vec.load(path_1) 42 | model_2 = Word2Vec.load(path_2) 43 | genes_1 = sorted(model_1.wv.vocab) 44 | vocab_1_size = len(genes_1) 45 | genes_2 = sorted(model_2.wv.vocab) 46 | vocab_2_size = len(genes_2) 47 | 48 | backbone = [g for g in genes_1 if "pseudo" in g] 49 | 50 | # transform with linear regression 51 | model = linear_model.LinearRegression() 52 | model.fit(model_1.wv[backbone], model_2.wv[backbone]) 53 | 54 | # transform network 1 model 55 | for g in genes_1: 56 | model_1.wv[g] = np.array(model.predict([model_1.wv[g]])) 57 | # transform network 2 model 58 | for g in genes_2: 59 | model_2.wv[g] = np.array(model.predict([model_2.wv[g]])) 60 | 61 | dist1 = np.zeros((vocab_1_size, vocab_2_size)) 62 | for i, gene_i in enumerate(genes_1): 63 | for j, gene_j in enumerate(genes_2): 64 | dist1[i, j] = cosine_similarity( 65 | model_1[gene_i].reshape(1, mbd_dim), 66 | model_2[gene_j].reshape(1, mbd_dim)) 67 | return dist1, genes_1, genes_2 68 | 69 | 70 | def angular_dist(sim_matrix, genes1, genes2): 71 | vocab_size1 = len(genes1) 72 | vocab_size2 = len(genes2) 73 | dist = np.zeros((vocab_size1, vocab_size2)) 74 | for i, gene_i in enumerate(genes1): 75 | for j, gene_j in enumerate(genes2): 76 | dist[i, j] = np.arccos(np.clip(a=sim_matrix[i, j], a_min=-1, a_max=1)) / np.pi 77 | return dist 78 | 79 | 80 | def match_dims(sim_matrix): 81 | numrows, numcols = sim_matrix.shape 82 | max_sim = np.amax(sim_matrix) + 1 83 | if numrows > numcols: 84 | slack = numrows - numcols 85 | new_cols = np.ones((numrows, slack)) * max_sim 86 | sim_matrix = np.concatenate((sim_matrix, new_cols), axis=1) 87 | elif numrows < numcols: 88 | slack = numcols - numrows 89 | new_rows = np.ones((slack, numcols)) * max_sim 90 | sim_matrix = np.concatenate((sim_matrix, new_rows), axis=0) 91 | 92 | return sim_matrix 93 | 94 | 95 | def compare_anchors(dist_matrix, 
genes_1, genes_2, train_anchors, convertor, substr='pseudo_'): 96 | gene_ids_1 = convertor.ints2ids(genes_1) 97 | gene_ids_2 = convertor.ints2ids(genes_2) 98 | sub = [substr+x for x in train_anchors] 99 | 100 | for prefix in sub: 101 | anchor_dist = [] 102 | dangle_1 = [s for s in gene_ids_1 if prefix in s] 103 | dangle_2 = [s for s in gene_ids_2 if prefix in s] 104 | # convert dangle ids to ints 105 | dangle_1 = convertor.ids2ints(dangle_1) 106 | dangle_2 = convertor.ids2ints(dangle_2) 107 | idx_1 = [] 108 | idx_2 = [] 109 | for i, item in enumerate(genes_1): 110 | for d in dangle_1: 111 | if int(item) == d: 112 | idx_1.append(i) 113 | for i, item in enumerate(genes_2): 114 | for d in dangle_2: 115 | if int(item) == d: 116 | idx_2.append(i) 117 | 118 | for i,j in zip(idx_1, idx_2): 119 | anchor_dist.append(dist_matrix[i, j]) 120 | 121 | pvalue = 0 122 | for i in range(0,1000): 123 | rand_dist = [] 124 | for i in range(0,len(dangle_1)): 125 | rand_int = random.randint(0, min(len(genes_1)-1, len(genes_2)-1)) 126 | rand_dist.append(dist_matrix[rand_int, rand_int]) 127 | pvalue = pvalue + (sum(anchor_dist) > sum(rand_dist)) 128 | # get the test statistic and p-value 129 | # statistic, pvalue = stats.ttest_ind(anchor_dist, rand_dist) 130 | print(pvalue/1000) 131 | 132 | return pvalue 133 | 134 | 135 | def biclustering(dist, genes_1, genes_2, x_label, y_label, out_file, experiment, id_convertor, n_clusters=3, precent_visualize=0.1): 136 | model = SpectralBiclustering(n_clusters=n_clusters, n_components=12, n_best=6, 137 | init='random', random_state=1) 138 | 139 | m, n = dist.shape 140 | assert m == len(genes_1) and n == len(genes_2) 141 | model.fit(dist) 142 | rows = [(idx, clust_id) for idx, clust_id in enumerate(model.row_labels_)] 143 | selected_rows = random.choices(rows, k=int(precent_visualize * len(rows))) 144 | selected_rows_name = [genes_1[idx] for idx, _ in selected_rows] 145 | selected_rows_clust_ids = [clust_id for _, clust_id in selected_rows] 146 | selected_rows_indices = [idx for idx, _ in selected_rows] 147 | # Slect columns 148 | cols = [(idx, clust_id) for idx, clust_id in enumerate(model.column_labels_)] 149 | selected_cols = random.choices(cols, k=int(precent_visualize * len(cols))) 150 | selected_cols_names = [genes_2[idx] for idx, _ in selected_cols] 151 | selected_cols_clust_ids = [clust_id for _, clust_id in selected_cols] 152 | selected_cols_indices = [idx for idx, _ in selected_cols] 153 | # Selected dist 154 | selected_dist = dist[selected_rows_indices] [:, selected_cols_indices] 155 | # Sort rows 156 | sorted_rows_indices = np.argsort(selected_rows_clust_ids) 157 | selected_dist = selected_dist[sorted_rows_indices, :] 158 | selected_row_names = [selected_rows_name[i] for i in sorted_rows_indices] 159 | #selected_row_names = selected_rows_name[sorted_rows_indices] 160 | # sort columns 161 | sorted_cols_indices = np.argsort(selected_cols_clust_ids) 162 | selected_dist = selected_dist[:, sorted_cols_indices] 163 | selected_cols_names = [selected_cols_names[i] for i in sorted_cols_indices] 164 | 165 | result = pd.DataFrame(selected_dist, columns=selected_cols_names, index=selected_rows_name) 166 | 167 | ax = sns.heatmap(result, cmap="Greens_r", square=True) 168 | plt.title("Biclustering Results") 169 | ax.set_yticklabels([]) 170 | ax.set_xticklabels([]) 171 | ax.tick_params(left=False, bottom=False) 172 | ax.set_ylabel('{} genes'.format(x_label)) 173 | ax.set_xlabel('{} genes'.format(y_label)) 174 | figure = ax.get_figure() 175 | figure.savefig(out_file) 176 | 
plt.close() 177 | 178 | for bic in range(n_clusters*n_clusters): 179 | #print(bic) 180 | r = list(model.rows_[bic]) 181 | rows = [i for (i, b) in zip(genes_1, r) if b] 182 | 183 | c = list(model.columns_[bic]) 184 | columns = [i for (i, b) in zip(genes_2, c) if b] 185 | 186 | rows = id_convertor.ints2ids([int(k) for k in rows]) 187 | columns = id_convertor.ints2ids([int(k) for k in columns]) 188 | 189 | cluster_path = os.path.join(experiment, f'{bic}_{x_label}_{y_label}_biclustering.csv') 190 | with open(cluster_path, 'w') as fout: 191 | fout.write(','.join(rows)) 192 | fout.write("\n") 193 | fout.write(','.join(columns)) 194 | 195 | def get_distance(path_1, path_2, mbd_dim): 196 | model_1 = Word2Vec.load(path_1) 197 | model_2 = Word2Vec.load(path_2) 198 | genes_1 = sorted(model_1.wv.vocab) 199 | vocab_1_size = len(genes_1) 200 | genes_2 = sorted(model_2.wv.vocab) 201 | vocab_2_size = len(genes_2) 202 | logging.info(f'Read {vocab_1_size} gene from the model in {path_1}') 203 | logging.info(f'Read {vocab_2_size} gene from the model in {path_2}') 204 | x1 = np.array([model_1.wv[gene_i] for gene_i in genes_1]) 205 | x2 = np.array([model_2.wv[gene_i] for gene_i in genes_2]) 206 | dist = cdist2(x1, x2, 'cosine')/2 207 | return dist, genes_1, genes_2 208 | 209 | 210 | def make_heatmap(dist, image_path): 211 | df = pd.DataFrame(dist) 212 | plt.figure() 213 | ax = sns.heatmap(df, cmap='Greens_r', square=True) 214 | ax.tick_params(left=False, bottom=False) 215 | ax.set_yticklabels([]) 216 | ax.set_xticklabels([]) 217 | figure = ax.get_figure() 218 | figure.savefig(image_path) 219 | plt.close() 220 | 221 | 222 | def read_anchors(anchor_path, non_anchor_path): 223 | with open(anchor_path, 'r') as f: 224 | anchors = list(csv.reader(f, delimiter=',')) 225 | with open(non_anchor_path, 'r') as f: 226 | non_anchors = list(csv.reader(f, delimiter=',')) 227 | return anchors, non_anchors 228 | 229 | 230 | def train(params): 231 | for pheno in params['phenotypes']: 232 | for rep_id in range(params['n_replicates']): 233 | rep_id = str(rep_id) 234 | walks_path = os.path.join(params['experiment_name'], 235 | '{}_{}_walks.csv'.format(pheno, rep_id)) 236 | with open(walks_path) as fin: 237 | walks = list(csv.reader(fin)) 238 | model = Word2Vec(sentences=walks, 239 | size=params['embd_dim'], 240 | window=params['window'], 241 | min_count=params['min_count'], 242 | workers=params['n_workers'], 243 | iter=params['n_iter'], 244 | negative=params['negatives'], 245 | alpha=params['alpha'], 246 | sg = 1, 247 | min_alpha=params['min_alpha']) 248 | # Write model to file 249 | model.save(os.path.join(params['experiment_name'], 250 | '{}_{}.model'.format(pheno, rep_id))) 251 | 252 | 253 | def visualize(params): 254 | for rep_id in range(params['n_replicates']): 255 | rep_id = str(rep_id) 256 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2): 257 | path_1 = os.path.join(params['experiment_name'], 258 | '{}_{}.model'.format(pheno_1, rep_id)) 259 | path_2 = os.path.join(params['experiment_name'], 260 | '{}_{}.model'.format(pheno_2, rep_id)) 261 | viz_path = os.path.join(params['experiment_name'], 262 | '{}_vs_{}_{}.pdf'.format(pheno_1, pheno_2, 263 | rep_id)) 264 | make_heatmap(path_1, path_2, 265 | params['embd_dim'], 266 | image_path=viz_path) 267 | 268 | 269 | if __name__ == '__main__': 270 | anchor_stats = [] 271 | parser = argparse.ArgumentParser(description='Generate datasets') 272 | parser.add_argument('-c', '--config', metavar='JSON file path', 273 | action='store', required=True, 274 | help='Path to a 
config file') 275 | parser.add_argument('-n', '--no-train', dest='no_train', 276 | action='store_true', default=False, 277 | help='Skip training and only produce visualizations.') 278 | args = parser.parse_args() 279 | # read training config 280 | with open(args.config) as fin: 281 | params = json.load(fin) 282 | logging.info('Read parameters') 283 | # Train models for all replicates 284 | if args.no_train is False: 285 | train(params) 286 | logging.info('Start training ...') 287 | 288 | # Generate visualizations 289 | train_anchor_path = os.path.join(params['experiment_name'], 'train_anchors.csv') 290 | test_anchor_path = os.path.join(params['experiment_name'], 'test_anchors.csv') 291 | 292 | train_anchors, test_anchors = read_anchors(anchor_path=train_anchor_path, non_anchor_path=test_anchor_path) 293 | logging.info('Using {} potential anchors for training and {} for testing'.format(len(train_anchors), 294 | len(test_anchors))) 295 | convertor_file = os.path.join(params['experiment_name'],'IDConvertor.json') 296 | id_convertor = IDCovertor.load(convertor_file) 297 | 298 | for rep_id in range(params['n_replicates']): 299 | logging.info(f'Start working on replicate {rep_id}') 300 | rep_id = str(rep_id) 301 | for pheno_1, pheno_2 in combinations(params['phenotypes'], 2): 302 | logging.info(f'Start working on {pheno_1} and {pheno_2}') 303 | path_1 = os.path.join(params['experiment_name'], 304 | '{}_{}.model'.format(pheno_1, rep_id)) 305 | path_2 = os.path.join(params['experiment_name'], 306 | '{}_{}.model'.format(pheno_2, rep_id)) 307 | viz_path = os.path.join(params['experiment_name'], 308 | '{}_vs_{}_{}.pdf'.format(pheno_1, pheno_2, 309 | rep_id)) 310 | 311 | dist, gene_names_1, gene_names_2 = get_distance(path_1, path_2, params['embd_dim']) 312 | logging.info('Calculated distance matrix') 313 | #ang_dist = angular_dist(dist, gene_names_1, gene_names_2) 314 | ang_dist = dist 315 | del(dist) 316 | logging.info('Calculated angular distance matrix') 317 | #make_heatmap(ang_dist, viz_path) 318 | #pvalue = alignment_permutation_test(vocab1_length=len(gene_names_1), vocab2_length=len(gene_names_2), distance=ang_dist, actual_score=global_cost) 319 | #print(pvalue) 320 | bic_path = os.path.join(params['experiment_name'], 321 | '{}_vs_{}_{}_biclustering.pdf'.format(pheno_1, pheno_2, 322 | rep_id)) 323 | biclustering(ang_dist, gene_names_1, gene_names_2, pheno_1, pheno_2, bic_path, params['experiment_name'], id_convertor, n_clusters=5) 324 | #logging.info('Finsihed biclustering') 325 | 326 | expanded_matrix = match_dims(ang_dist) 327 | #logging.info('Expanded the angular distance matrix') 328 | row_ind, col_ind = linear_sum_assignment(expanded_matrix) 329 | #logging.info('Calculated the Huangarian distance matrix') 330 | globa_scores = expanded_matrix[row_ind, col_ind] 331 | global_cost = globa_scores.sum() 332 | norm = max(len(row_ind), len(col_ind)) 333 | global_cost = global_cost/norm 334 | print(pheno_1, pheno_2, global_cost) 335 | #matches = [i for i, j in zip(row_ind, col_ind) if i == j] 336 | #print(len(matches)/norm) 337 | #pvalue = compare_anchors(ang_dist, gene_names_1, gene_names_2, train_anchors[int(rep_id)], convertor=id_convertor) 338 | #anchor_stats.append(pvalue) 339 | 340 | #with open(os.path.join(params['experiment_name'],'stats.csv'),'w') as fout: 341 | # csv_out = csv.writer(fout) 342 | # csv_out.writerow(['statistic','pval']) 343 | # for row in anchor_stats: 344 | # csv_out.writerow(row) 345 | 346 | 347 | 
--------------------------------------------------------------------------------